gsdx: split GSVector.h into smaller files

2017-03-18 09:58:01 +01:00 · 2017-03-18 09:58:01 +01:00 · 9865270e68
parent f3a89f59e4
commit 9865270e68
8 changed files with 6011 additions and 5905 deletions
--- a/plugins/GSdx/CMakeLists.txt
+++ b/plugins/GSdx/CMakeLists.txt
@ -175,6 +175,10 @@ set(GSdxHeaders
    GSUniformBufferOGL.h
    GSUtil.h
    GSVector.h
+    GSVector4.h
+    GSVector4i.h
+    GSVector8.h
+    GSVector8i.h
    GSVertexArrayOGL.h
    GSVertex.h
    GSVertexHW.h
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
--- a/plugins/GSdx/GSVector4.h
+++ b/plugins/GSdx/GSVector4.h
@ -0,0 +1,960 @@
+/*
+ *	Copyright (C) 2007-2017 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+class alignas(16) GSVector4
+{
+public:
+	union
+	{
+		struct {float x, y, z, w;};
+		struct {float r, g, b, a;};
+		struct {float left, top, right, bottom;};
+		float v[4];
+		float f32[4];
+		int8 i8[16];
+		int16 i16[8];
+		int32 i32[4];
+		int64 i64[2];
+		uint8 u8[16];
+		uint16 u16[8];
+		uint32 u32[4];
+		uint64 u64[2];
+		__m128 m;
+	};
+
+	static GSVector4 m_ps0123;
+	static GSVector4 m_ps4567;
+	static GSVector4 m_half;
+	static GSVector4 m_one;
+	static GSVector4 m_two;
+	static GSVector4 m_four;
+	static GSVector4 m_x4b000000;
+	static GSVector4 m_x4f800000;
+	static GSVector4 m_max;
+	static GSVector4 m_min;
+
+	static void InitVectors();
+
+	__forceinline GSVector4()
+	{
+	}
+
+	__forceinline GSVector4(float x, float y, float z, float w)
+	{
+		m = _mm_set_ps(w, z, y, x);
+	}
+
+	__forceinline GSVector4(float x, float y)
+	{
+		m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y));
+	}
+
+	__forceinline GSVector4(int x, int y, int z, int w)
+	{
+		GSVector4i v(x, y, z, w);
+
+		m = _mm_cvtepi32_ps(v.m);
+	}
+
+	__forceinline GSVector4(int x, int y)
+	{
+		m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
+	}
+
+	//Not currently used, just causes a compiler warning
+	/*__forceinline GSVector4(const GSVector4& v)
+	{
+		m = v.m;
+	}*/
+
+	// Loads the first 64 bits of v into the low half of the register;
+	// _mm_loadl_epi64 zeroes the upper 64 bits.
+	__forceinline explicit GSVector4(const GSVector2& v)
+	{
+		// Keep the pointer const-qualified: v is only read.
+		m = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)&v));
+	}
+
+	// Loads the first 64 bits of v as two 32-bit integers (upper 64 bits zeroed by
+	// _mm_loadl_epi64) and converts all four integer lanes to float.
+	__forceinline explicit GSVector4(const GSVector2i& v)
+	{
+		// Keep the pointer const-qualified: v is only read.
+		m = _mm_cvtepi32_ps(_mm_loadl_epi64((const __m128i*)&v));
+	}
+
+	__forceinline explicit GSVector4(__m128 m)
+	{
+		this->m = m;
+	}
+
+	__forceinline explicit GSVector4(float f)
+	{
+		*this = f;
+	}
+
+	// Builds a vector from a single integer converted to float.
+	__forceinline explicit GSVector4(int i)
+	{
+		#if _M_SSE >= 0x501
+
+		// AVX2: replicate the integer into every 32-bit lane, then convert all lanes at once.
+		const __m128i replicated = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		m = _mm_cvtepi32_ps(replicated);
+
+		#else
+
+		// Older targets: go through GSVector4i's int constructor and the
+		// GSVector4i -> GSVector4 converting constructor (both defined elsewhere).
+		GSVector4i lanes(i);
+
+		*this = GSVector4(lanes);
+
+		#endif
+	}
+	
+	// Builds a vector from an unsigned 32-bit value converted to float.
+	// The value is first converted as a signed int; m_x4f800000 is then added to the
+	// lanes selected by v.sra32(31) to compensate for inputs with the top bit set.
+	// NOTE(review): m_x4f800000 and sra32 are defined elsewhere; presumably 2^32 as a
+	// float and an arithmetic shift producing an all-ones mask for negative lanes - confirm.
+	__forceinline explicit GSVector4(uint32 u)
+	{
+		GSVector4i v((int)u);
+
+		*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
+	}
+
+	__forceinline explicit GSVector4(const GSVector4i& v);
+
+	__forceinline static GSVector4 cast(const GSVector4i& v);
+
+	#if _M_SSE >= 0x500
+
+	__forceinline static GSVector4 cast(const GSVector8& v);
+
+	#endif
+
+	#if _M_SSE >= 0x501
+
+	__forceinline static GSVector4 cast(const GSVector8i& v);
+
+	#endif
+
+	__forceinline void operator = (const GSVector4& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline void operator = (float f)
+	{
+		#if _M_SSE >= 0x501
+
+		m =  _mm_broadcastss_ps(_mm_load_ss(&f));
+
+		#else
+
+		m = _mm_set1_ps(f);
+
+		#endif
+	}
+
+	__forceinline void operator = (__m128 m)
+	{
+		this->m = m;
+	}
+
+	__forceinline operator __m128() const
+	{
+		return m;
+	}
+
+	__forceinline uint32 rgba32() const
+	{
+		return GSVector4i(*this).rgba32();
+	}
+
+	__forceinline static GSVector4 rgba32(uint32 rgba)
+	{
+		return GSVector4(GSVector4i::load((int)rgba).u8to32());
+	}
+
+	__forceinline static GSVector4 rgba32(uint32 rgba, int shift)
+	{
+		return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift);
+	}
+
+	__forceinline GSVector4 abs() const
+	{
+		return *this & cast(GSVector4i::x7fffffff());
+	}
+
+	__forceinline GSVector4 neg() const
+	{
+		return *this ^ cast(GSVector4i::x80000000());
+	}
+
+	__forceinline GSVector4 rcp() const
+	{
+		return GSVector4(_mm_rcp_ps(m));
+	}
+
+	// Refined reciprocal: starting from the rcp() estimate r, returns (r + r) - (r * r) * x,
+	// where x is *this.
+	__forceinline GSVector4 rcpnr() const
+	{
+		const GSVector4 estimate = rcp();
+		const GSVector4 doubled = estimate + estimate;
+		const GSVector4 squared = estimate * estimate;
+
+		return doubled - squared * *this;
+	}
+
+	// Rounds every lane according to the compile-time rounding mode.
+	// With SSE4.1 this is a single _mm_round_ps; otherwise only the nearest, negative-infinity
+	// and positive-infinity modes are emulated (see the ASSERT below).
+	template<int mode> __forceinline GSVector4 round() const
+	{
+		#if _M_SSE >= 0x401
+
+		return GSVector4(_mm_round_ps(m, mode));
+
+		#else
+
+		GSVector4 a = *this;
+
+		// b = m_x4b000000 carrying the sign of a.
+		// NOTE(review): m_x4b000000 is defined elsewhere; presumably 2^23, the magnitude above
+		// which a float has no fraction bits - confirm.
+
+		GSVector4 b = (a & cast(GSVector4i::x80000000())) | m_x4b000000;
+
+		// Adding and then subtracting b leaves a rounded by the FPU's current rounding mode.
+
+		b = a + b - b;
+
+		if((mode & 7) == (Round_NegInf & 7))
+		{
+			// Lanes that were rounded up (a < b) are lowered by m_one.
+
+			return b - ((a < b) & m_one);
+		}
+
+		if((mode & 7) == (Round_PosInf & 7))
+		{
+			// Lanes that were rounded down (a > b) are raised by m_one.
+
+			return b + ((a > b) & m_one);
+		}
+
+		ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented
+
+		return b;
+
+		#endif
+	}
+
+	__forceinline GSVector4 floor() const
+	{
+		return round<Round_NegInf>();
+	}
+
+	__forceinline GSVector4 ceil() const
+	{
+		return round<Round_PosInf>();
+	}
+
+	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+
+	#define LOG_POLY0(x, c0) GSVector4(c0)
+	#define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0)))
+	#define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0)))
+	#define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0)))
+	#define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0)))
+	#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0)))
+
+	// Approximates log2 of every lane.
+	// precision (3..6) selects the polynomial used for the mantissa; any other value
+	// behaves like the default, 5.
+	__forceinline GSVector4 log2(int precision = 5) const
+	{
+		// NOTE: sign bit ignored, safe to pass negative numbers
+
+		// A float is mant * 2^expo, so log2(mant * 2^expo) == log2(mant) + expo.
+		// Only log2(mant) needs approximating, and a normalized mantissa lies in the short range [1, 2[.
+
+		GSVector4i bits = GSVector4i::cast(*this);
+
+		// Drop the sign bit, bring the exponent field down and subtract x0000007f
+		// (assuming GSVector4i's >> is a logical shift - it is defined elsewhere).
+
+		const GSVector4 expo = GSVector4(((bits << 1) >> 24) - GSVector4i::x0000007f());
+
+		// Keep only the low 23 bits and OR in the bit pattern of m_one.
+
+		const GSVector4 mant = GSVector4::cast((bits << 9) >> 9) | m_one;
+
+		GSVector4 poly;
+
+		// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+
+		switch(precision)
+		{
+		case 3:
+			poly = LOG_POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+			break;
+		case 4:
+			poly = LOG_POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+			break;
+		case 6:
+			poly = LOG_POLY5(mant, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+			break;
+		case 5:
+		default:
+			poly = LOG_POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+			break;
+		}
+
+		// Multiplying by (mant - 1) effectively raises the polynomial degree by one,
+		// and ensures that log2(1) == 0
+
+		return poly * (mant - m_one) + expo;
+	}
+
+	// Returns *this * a + b.
+	// The fused _mm_fmadd_ps path is disabled by the "#if 0", so the multiply and the add
+	// are always performed as two separate operations.
+	__forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fmadd_ps(m, a, b));
+		
+		#else
+		
+		return *this * a + b;
+		
+		#endif
+	}
+
+	// Returns *this * a - b.
+	// The fused _mm_fmsub_ps path is disabled by the "#if 0".
+	__forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fmsub_ps(m, a, b));
+		
+		#else
+		
+		return *this * a - b;
+		
+		#endif
+	}
+
+	// Returns b - *this * a.
+	// The fused _mm_fnmadd_ps path is disabled by the "#if 0".
+	__forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fnmadd_ps(m, a, b));
+		
+		#else
+		
+		return b - *this * a;
+		
+		#endif
+	}
+
+	// Returns -b - *this * a.
+	// The fused _mm_fnmsub_ps path is disabled by the "#if 0".
+	__forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fnmsub_ps(m, a, b));
+		
+		#else
+
+		return -b - *this * a;
+
+		#endif
+	}
+
+	__forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const
+	{
+		return a.madd(b, *this); // *this + a * b
+	}
+
+	__forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const
+	{
+		return a.nmadd(b, *this); // *this - a * b
+	}
+
+	__forceinline GSVector4 hadd() const
+	{
+		#if _M_SSE >= 0x300
+		
+		return GSVector4(_mm_hadd_ps(m, m));
+		
+		#else
+		
+		return xzxz() + ywyw();
+		
+		#endif
+	}
+
+	__forceinline GSVector4 hadd(const GSVector4& v) const
+	{
+		#if _M_SSE >= 0x300
+		
+		return GSVector4(_mm_hadd_ps(m, v.m));
+		
+		#else
+		
+		return xzxz(v) + ywyw(v);
+		
+		#endif
+	}
+
+	__forceinline GSVector4 hsub() const
+	{
+		#if _M_SSE >= 0x300
+		
+		return GSVector4(_mm_hsub_ps(m, m));
+		
+		#else
+		
+		return xzxz() - ywyw();
+		
+		#endif
+	}
+
+	__forceinline GSVector4 hsub(const GSVector4& v) const
+	{
+		#if _M_SSE >= 0x300
+		
+		return GSVector4(_mm_hsub_ps(m, v.m));
+		
+		#else
+		
+		return xzxz(v) - ywyw(v);
+
+		#endif
+	}
+
+	#if _M_SSE >= 0x401
+
+	template<int i> __forceinline GSVector4 dp(const GSVector4& v) const
+	{
+		return GSVector4(_mm_dp_ps(m, v.m, i));
+	}
+
+	#endif
+
+	__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
+	{
+		return GSVector4(_mm_min_ps(_mm_max_ps(m, a), b));
+	}
+
+	__forceinline GSVector4 sat(const GSVector4& a) const
+	{
+		return GSVector4(_mm_min_ps(_mm_max_ps(m, a.xyxy()), a.zwzw()));
+	}
+
+	__forceinline GSVector4 sat(const float scale = 255) const
+	{
+		return sat(zero(), GSVector4(scale));
+	}
+
+	__forceinline GSVector4 clamp(const float scale = 255) const
+	{
+		return min(GSVector4(scale));
+	}
+
+	__forceinline GSVector4 min(const GSVector4& a) const
+	{
+		return GSVector4(_mm_min_ps(m, a));
+	}
+
+	__forceinline GSVector4 max(const GSVector4& a) const
+	{
+		return GSVector4(_mm_max_ps(m, a));
+	}
+
+	#if _M_SSE >= 0x401
+
+	template<int mask> __forceinline GSVector4 blend32(const GSVector4& a)  const
+	{
+		return GSVector4(_mm_blend_ps(m, a, mask));
+	}
+
+	#endif
+
+	__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask)  const
+	{
+		#if _M_SSE >= 0x401
+
+		return GSVector4(_mm_blendv_ps(m, a, mask));
+
+		#else
+
+		return GSVector4(_mm_or_ps(_mm_andnot_ps(mask, m), _mm_and_ps(mask, a)));
+
+		#endif
+	}
+
+	__forceinline GSVector4 upl(const GSVector4& a) const
+	{
+		return GSVector4(_mm_unpacklo_ps(m, a));
+	}
+
+	__forceinline GSVector4 uph(const GSVector4& a) const
+	{
+		return GSVector4(_mm_unpackhi_ps(m, a));
+	}
+
+	__forceinline GSVector4 upld(const GSVector4& a) const
+	{
+		return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(a.m))));
+	}
+
+	__forceinline GSVector4 uphd(const GSVector4& a) const
+	{
+		return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(a.m))));
+	}
+
+	__forceinline GSVector4 l2h(const GSVector4& a) const
+	{
+		return GSVector4(_mm_movelh_ps(m, a));
+	}
+
+	__forceinline GSVector4 h2l(const GSVector4& a) const
+	{
+		return GSVector4(_mm_movehl_ps(m, a));
+	}
+
+	__forceinline GSVector4 andnot(const GSVector4& v) const
+	{
+		return GSVector4(_mm_andnot_ps(v.m, m));
+	}
+
+	__forceinline int mask() const
+	{
+		return _mm_movemask_ps(m);
+	}
+
+	__forceinline bool alltrue() const
+	{
+		return mask() == 0xf;
+	}
+
+	// Returns true when no lane is set.
+	// NOTE(review): the three paths do not test the same bits. _mm_testz_ps and mask()
+	// look only at the sign bit of each lane, while the SSE4.1 path (_mm_testz_si128)
+	// requires all 128 bits to be zero. They agree for comparison results (lanes that are
+	// all ones or all zeros) but can differ for arbitrary values - confirm callers only pass masks.
+	__forceinline bool allfalse() const
+	{
+		#if _M_SSE >= 0x500
+
+		return _mm_testz_ps(m, m) != 0;
+
+		#elif _M_SSE >= 0x401
+
+		__m128i a = _mm_castps_si128(m);
+
+		return _mm_testz_si128(a, a) != 0;
+
+		#else
+
+		return mask() == 0;
+
+		#endif
+	}
+
+	// Returns a copy of *this in which every NaN lane is replaced by the matching lane of v.
+	// "*this == *this" is false exactly for NaN lanes, so the blend keeps *this where the
+	// comparison holds and falls back to v elsewhere.
+	__forceinline GSVector4 replace_nan(const GSVector4& v) const
+	{
+		return v.blend32(*this, *this == *this);
+	}
+
+	// Returns a copy of *this in which lane dst is replaced by lane src of v.
+	// src and dst are compile-time lane indices and must both be in the range 0..3.
+	template<int src, int dst> __forceinline GSVector4 insert32(const GSVector4& v) const
+	{
+		// TODO: use blendps when src == dst
+
+		#if 0 // _M_SSE >= 0x401
+
+		// NOTE: it's faster with shuffles...
+
+		return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
+
+		#else
+
+		// Each case first builds a temporary holding the wanted lane of v next to lanes of
+		// *this, then shuffles it back into place with the remaining lanes of *this.
+
+		switch(dst)
+		{
+		case 0:
+			switch(src)
+			{
+			case 0: return yyxx(v).zxzw(*this);
+			case 1: return yyyy(v).zxzw(*this);
+			case 2: return yyzz(v).zxzw(*this);
+			case 3: return yyww(v).zxzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 1:
+			switch(src)
+			{
+			case 0: return xxxx(v).xzzw(*this);
+			case 1: return xxyy(v).xzzw(*this);
+			case 2: return xxzz(v).xzzw(*this);
+			case 3: return xxww(v).xzzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 2:
+			switch(src)
+			{
+			case 0: return xyzx(wwxx(v));
+			case 1: return xyzx(wwyy(v));
+			case 2: return xyzx(wwzz(v));
+			case 3: return xyzx(wwww(v));
+			default: __assume(0);
+			}
+			break;
+		case 3:
+			switch(src)
+			{
+			case 0: return xyxz(zzxx(v));
+			case 1: return xyxz(zzyy(v));
+			case 2: return xyxz(zzzz(v));
+			case 3: return xyxz(zzww(v));
+			default: __assume(0);
+			}
+			break;
+		default:
+			__assume(0);
+		}
+
+		// Not reached for valid src/dst. Present so that no path falls off the end of a
+		// non-void function; GSVector8::insert32 ends the same way.
+
+		return *this;
+
+		#endif
+
+	}
+
+#ifdef __linux__
+#if 0
+	// Debug build error, _mm_extract_ps is actually a macro that use an anonymous union
+	// that contains i. I decide to rename the template on linux but it makes windows unhappy
+	// Hence the nice ifdef
+	//
+	// Code extract:
+	// union { int i; float f; } __tmp;
+
+GSVector.h:2977:40: error: declaration of 'int GSVector4::extract32() const::<anonymous union>::i'
+   return _mm_extract_ps(m, i);
+GSVector.h:2973:15: error:  shadows template parm 'int i'
+  template<int i> __forceinline int extract32() const
+#endif
+
+	// Returns the raw 32-bit pattern of the selected lane as an int (no float-to-int
+	// conversion): _mm_extract_ps with SSE4.1, otherwise a read through the i32 union view.
+	// Linux variant: identical to the one below except that the template parameter is
+	// named "index" (see the disabled note above for the reason).
+	template<int index> __forceinline int extract32() const
+	{
+		#if _M_SSE >= 0x401
+
+		return _mm_extract_ps(m, index);
+
+		#else
+
+		return i32[index];
+
+		#endif
+	}
+#else
+	// Returns the raw 32-bit pattern of lane i as an int (no float-to-int conversion):
+	// _mm_extract_ps with SSE4.1, otherwise a read through the i32 union view.
+	template<int i> __forceinline int extract32() const
+	{
+		#if _M_SSE >= 0x401
+
+		return _mm_extract_ps(m, i);
+
+		#else
+
+		return i32[i];
+
+		#endif
+	}
+#endif
+
+	__forceinline static GSVector4 zero()
+	{
+		return GSVector4(_mm_setzero_ps());
+	}
+
+	__forceinline static GSVector4 xffffffff()
+	{
+		return zero() == zero();
+	}
+
+	__forceinline static GSVector4 ps0123()
+	{
+		return GSVector4(m_ps0123);
+	}
+
+	__forceinline static GSVector4 ps4567()
+	{
+		return GSVector4(m_ps4567);
+	}
+
+	// Loads 64 bits from p into the low half of the register; _mm_load_sd zeroes the upper half.
+	__forceinline static GSVector4 loadl(const void* p)
+	{
+		// Keep the pointer const-qualified: the memory is only read.
+		return GSVector4(_mm_castpd_ps(_mm_load_sd((const double*)p)));
+	}
+
+	__forceinline static GSVector4 load(float f)
+	{
+		return GSVector4(_mm_load_ss(&f));
+	}
+
+	// Converts an unsigned 32-bit value to float.
+	// The value is loaded and converted as a signed int; m_x4f800000 is then added to the
+	// lanes selected by v.sra32(31) to compensate for inputs with the top bit set.
+	// NOTE(review): GSVector4i::load, sra32 and m_x4f800000 are defined elsewhere; presumably
+	// the load fills only lane 0 and m_x4f800000 is 2^32 as a float - confirm.
+	__forceinline static GSVector4 load(uint32 u)
+	{
+		GSVector4i v = GSVector4i::load((int)u);
+
+		return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
+	}
+
+	template<bool aligned> __forceinline static GSVector4 load(const void* p)
+	{
+		return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));
+	}
+
+	__forceinline static void storent(void* p, const GSVector4& v)
+	{
+		_mm_stream_ps((float*)p, v.m);
+	}
+
+	__forceinline static void storel(void* p, const GSVector4& v)
+	{
+		_mm_store_sd((double*)p, _mm_castps_pd(v.m));
+	}
+
+	__forceinline static void storeh(void* p, const GSVector4& v)
+	{
+		_mm_storeh_pd((double*)p, _mm_castps_pd(v.m));
+	}
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4& v)
+	{
+		if(aligned) _mm_store_ps((float*)p, v.m);
+		else _mm_storeu_ps((float*)p, v.m);
+	}
+
+	__forceinline static void store(float* p, const GSVector4& v)
+	{
+		_mm_store_ss(p, v.m);
+	}
+
+	__forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
+	{
+		GSVector4i mask = GSVector4i::x000000ff();
+
+		a = GSVector4(v & mask);
+		b = GSVector4((v >> 8) & mask);
+		c = GSVector4((v >> 16) & mask);
+		d = GSVector4((v >> 24));
+	}
+
+	// Transposes, in place, the 4x4 matrix whose rows are a, b, c and d.
+	__forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
+	{
+		// Gather the (x, y) and (z, w) halves of each row pair before any row is overwritten.
+
+		const GSVector4 ab_lo = a.xyxy(b); // a.x a.y b.x b.y
+		const GSVector4 cd_lo = c.xyxy(d); // c.x c.y d.x d.y
+		const GSVector4 ab_hi = a.zwzw(b); // a.z a.w b.z b.w
+		const GSVector4 cd_hi = c.zwzw(d); // c.z c.w d.z d.w
+
+		// Interleave the halves so that each output row holds one input column.
+
+		a = ab_lo.xzxz(cd_lo); // a.x b.x c.x d.x
+		b = ab_lo.ywyw(cd_lo); // a.y b.y c.y d.y
+		c = ab_hi.xzxz(cd_hi); // a.z b.z c.z d.z
+		d = ab_hi.ywyw(cd_hi); // a.w b.w c.w d.w
+	}
+
+	__forceinline GSVector4 operator - () const
+	{
+		return neg();
+	}
+
+	__forceinline void operator += (const GSVector4& v)
+	{
+		m = _mm_add_ps(m, v);
+	}
+
+	__forceinline void operator -= (const GSVector4& v)
+	{
+		m = _mm_sub_ps(m, v);
+	}
+
+	__forceinline void operator *= (const GSVector4& v)
+	{
+		m = _mm_mul_ps(m, v);
+	}
+
+	__forceinline void operator /= (const GSVector4& v)
+	{
+		m = _mm_div_ps(m, v);
+	}
+
+	__forceinline void operator += (float f)
+	{
+		*this += GSVector4(f);
+	}
+
+	__forceinline void operator -= (float f)
+	{
+		*this -= GSVector4(f);
+	}
+
+	__forceinline void operator *= (float f)
+	{
+		*this *= GSVector4(f);
+	}
+
+	__forceinline void operator /= (float f)
+	{
+		*this /= GSVector4(f);
+	}
+
+	__forceinline void operator &= (const GSVector4& v)
+	{
+		m = _mm_and_ps(m, v);
+	}
+
+	__forceinline void operator |= (const GSVector4& v)
+	{
+		m = _mm_or_ps(m, v);
+	}
+
+	__forceinline void operator ^= (const GSVector4& v)
+	{
+		m = _mm_xor_ps(m, v);
+	}
+
+	__forceinline friend GSVector4 operator + (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_add_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator - (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_sub_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator * (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_mul_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator / (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_div_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator + (const GSVector4& v, float f)
+	{
+		return v + GSVector4(f);
+	}
+
+	__forceinline friend GSVector4 operator - (const GSVector4& v, float f)
+	{
+		return v - GSVector4(f);
+	}
+
+	__forceinline friend GSVector4 operator * (const GSVector4& v, float f)
+	{
+		return v * GSVector4(f);
+	}
+
+	__forceinline friend GSVector4 operator / (const GSVector4& v, float f)
+	{
+		return v / GSVector4(f);
+	}
+
+	__forceinline friend GSVector4 operator & (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_and_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator | (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_or_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator ^ (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_xor_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator == (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmpeq_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator != (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmpneq_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator > (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmpgt_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator < (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmplt_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator >= (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmpge_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector4 operator <= (const GSVector4& v1, const GSVector4& v2)
+	{
+		return GSVector4(_mm_cmple_ps(v1, v2));
+	}
+
+	#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	#define VECTOR4_SHUFFLE_1(xs, xn) \
+		VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+	VECTOR4_SHUFFLE_1(x, 0)
+	VECTOR4_SHUFFLE_1(y, 1)
+	VECTOR4_SHUFFLE_1(z, 2)
+	VECTOR4_SHUFFLE_1(w, 3)
+
+	#if _M_SSE >= 0x501
+
+	__forceinline GSVector4 broadcast32() const
+	{
+		return GSVector4(_mm_broadcastss_ps(m));
+	}
+
+	__forceinline static GSVector4 broadcast32(const GSVector4& v)
+	{
+		return GSVector4(_mm_broadcastss_ps(v.m));
+	}
+
+	__forceinline static GSVector4 broadcast32(const void* f)
+	{
+		return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
+	}
+
+	#endif
+};
--- a/plugins/GSdx/GSVector4i.h
+++ b/plugins/GSdx/GSVector4i.h
--- a/plugins/GSdx/GSVector8.h
+++ b/plugins/GSdx/GSVector8.h
@ -0,0 +1,854 @@
+/*
+ *	Copyright (C) 2007-2017 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#if _M_SSE >= 0x500
+
+class alignas(32) GSVector8
+{
+public:
+	// Every member below is a different view of the same 32 bytes.
+	union
+	{
+		struct {float x0, y0, z0, w0, x1, y1, z1, w1;};
+		struct {float r0, g0, b0, a0, r1, g1, b1, a1;};
+		float v[8];
+		float f32[8];
+		int8 i8[32];
+		int16 i16[16];
+		int32 i32[8];
+		int64 i64[4];
+		uint8 u8[32];
+		uint16 u16[16];
+		uint32 u32[8];
+		uint64 u64[4];
+		__m256 m;
+		// m0 is the low 128-bit half and m1 the high half. They have to be wrapped in a
+		// struct: declared directly in the union, both would alias the low half.
+		struct {__m128 m0, m1;};
+	};
+
+	static GSVector8 m_half;
+	static GSVector8 m_one;
+	static GSVector8 m_x7fffffff;
+	static GSVector8 m_x80000000;
+	static GSVector8 m_x4b000000;
+	static GSVector8 m_x4f800000;
+	static GSVector8 m_max;
+	static GSVector8 m_min;
+
+	static void InitVectors();
+
+	__forceinline GSVector8() 
+	{
+	}
+
+	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
+	{
+		m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
+	}
+
+	__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		m = _mm256_cvtepi32_ps(_mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0));
+	}
+
+	__forceinline GSVector8(__m128 m0, __m128 m1)
+	{
+		#if 0 // _MSC_VER >= 1700 
+		
+		this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
+
+		#else
+
+		this->m = zero().insert<0>(m0).insert<1>(m1);
+
+		#endif
+	}
+
+	__forceinline GSVector8(const GSVector8& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline explicit GSVector8(float f)
+	{
+		*this = f;
+	}
+
+	// Builds a vector from a single integer converted to float.
+	__forceinline explicit GSVector8(int i)
+	{
+		#if _M_SSE >= 0x501
+
+		// AVX2: replicate the integer into all eight lanes, then convert them at once.
+		const __m256i replicated = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		m = _mm256_cvtepi32_ps(replicated);
+
+		#else
+
+		// AVX only: convert through the 128-bit types. Assigning the GSVector4 result goes
+		// through its __m128 conversion and operator = (__m128), which fills both halves.
+		GSVector4i lanes(i);
+
+		*this = GSVector4(lanes);
+
+		#endif
+	}
+
+	__forceinline explicit GSVector8(__m128 m)
+	{
+		*this = m;
+	}
+
+	__forceinline explicit GSVector8(__m256 m)
+	{
+		this->m = m;
+	}
+
+	#if _M_SSE >= 0x501
+
+	__forceinline explicit GSVector8(const GSVector8i& v);
+
+	__forceinline static GSVector8 cast(const GSVector8i& v);
+
+	#endif
+
+	__forceinline static GSVector8 cast(const GSVector4& v);
+	__forceinline static GSVector8 cast(const GSVector4i& v);
+
+	__forceinline void operator = (const GSVector8& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline void operator = (float f)
+	{
+		#if _M_SSE >= 0x501
+
+		m =  _mm256_broadcastss_ps(_mm_load_ss(&f));
+
+		#else
+
+		m = _mm256_set1_ps(f);
+
+		#endif
+	}
+
+	// Copies the 128-bit value into both halves of the 256-bit register: the cast supplies
+	// the low half and _mm256_insertf128_ps writes the same value into the high half.
+	// The parameter m shadows the member, hence the explicit this->m.
+	__forceinline void operator = (__m128 m)
+	{
+		this->m = _mm256_insertf128_ps(_mm256_castps128_ps256(m), m, 1);
+	}
+
+	__forceinline void operator = (__m256 m)
+	{
+		this->m = m;
+	}
+
+	__forceinline operator __m256() const
+	{
+		return m;
+	}
+
+	__forceinline GSVector8 abs() const
+	{
+		#if _M_SSE >= 0x501
+
+		return *this & cast(GSVector8i::x7fffffff());
+
+		#else
+		
+		return *this & m_x7fffffff;
+
+		#endif
+	}
+
+	__forceinline GSVector8 neg() const
+	{
+		#if _M_SSE >= 0x501
+
+		return *this ^ cast(GSVector8i::x80000000());
+
+		#else
+		
+		return *this ^ m_x80000000;
+
+		#endif
+	}
+
+	__forceinline GSVector8 rcp() const
+	{
+		return GSVector8(_mm256_rcp_ps(m));
+	}
+
+	__forceinline GSVector8 rcpnr() const
+	{
+		GSVector8 v = rcp();
+
+		return (v + v) - (v * v) * *this;
+	}
+
+	template<int mode> __forceinline GSVector8 round() const
+	{
+		return GSVector8(_mm256_round_ps(m, mode));
+	}
+
+	__forceinline GSVector8 floor() const
+	{
+		return round<Round_NegInf>();
+	}
+
+	__forceinline GSVector8 ceil() const
+	{
+		return round<Round_PosInf>();
+	}
+
+	#if _M_SSE >= 0x501
+
+	#define LOG8_POLY0(x, c0) GSVector8(c0)
+	#define LOG8_POLY1(x, c0, c1) (LOG8_POLY0(x, c1).madd(x, GSVector8(c0)))
+	#define LOG8_POLY2(x, c0, c1, c2) (LOG8_POLY1(x, c1, c2).madd(x, GSVector8(c0)))
+	#define LOG8_POLY3(x, c0, c1, c2, c3) (LOG8_POLY2(x, c1, c2, c3).madd(x, GSVector8(c0)))
+	#define LOG8_POLY4(x, c0, c1, c2, c3, c4) (LOG8_POLY3(x, c1, c2, c3, c4).madd(x, GSVector8(c0)))
+	#define LOG8_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG8_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector8(c0)))
+
+	__forceinline GSVector8 log2(int precision = 5) const
+	{
+		// NOTE: see GSVector4::log2
+
+		GSVector8 one = m_one;
+
+		GSVector8i i = GSVector8i::cast(*this);
+
+		GSVector8 e = GSVector8(((i << 1) >> 24) - GSVector8i::x0000007f());
+		GSVector8 m = GSVector8::cast((i << 9) >> 9) | one;
+
+		GSVector8 p;
+
+		switch(precision)
+		{
+		case 3:
+			p = LOG8_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+			break;
+		case 4:
+			p = LOG8_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+			break;
+		default:
+		case 5:
+			p = LOG8_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+			break;
+		case 6:
+			p = LOG8_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+			break;
+		}
+
+		// This effectively increases the polynomial degree by one, but ensures that log2(1) == 0
+
+		p = p * (m - one);
+
+		return p + e;
+	}
+
+	#endif
+
+	__forceinline GSVector8 madd(const GSVector8& a, const GSVector8& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fmadd_ps(m, a, b));
+		
+		#else
+		
+		return *this * a + b;
+		
+		#endif
+	}
+
+	__forceinline GSVector8 msub(const GSVector8& a, const GSVector8& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fmsub_ps(m, a, b));
+		
+		#else
+		
+		return *this * a - b;
+		
+		#endif
+	}
+
+	__forceinline GSVector8 nmadd(const GSVector8& a, const GSVector8& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fnmadd_ps(m, a, b));
+		
+		#else
+		
+		return b - *this * a;
+		
+		#endif
+	}
+
+	__forceinline GSVector8 nmsub(const GSVector8& a, const GSVector8& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fnmsub_ps(m, a, b));
+		
+		#else
+
+		return -b - *this * a;
+
+		#endif
+	}
+
+	__forceinline GSVector8 addm(const GSVector8& a, const GSVector8& b) const
+	{
+		return a.madd(b, *this); // *this + a * b
+	}
+
+	__forceinline GSVector8 subm(const GSVector8& a, const GSVector8& b) const
+	{
+		return a.nmadd(b, *this); // *this - a * b
+	}
+
+	__forceinline GSVector8 hadd() const
+	{
+		return GSVector8(_mm256_hadd_ps(m, m));
+	}
+
+	__forceinline GSVector8 hadd(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_hadd_ps(m, v.m));
+	}
+
+	__forceinline GSVector8 hsub() const
+	{
+		return GSVector8(_mm256_hsub_ps(m, m));
+	}
+
+	__forceinline GSVector8 hsub(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_hsub_ps(m, v.m));
+	}
+
+	template<int i> __forceinline GSVector8 dp(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_dp_ps(m, v.m, i));
+	}
+
+	__forceinline GSVector8 sat(const GSVector8& a, const GSVector8& b) const
+	{
+		return GSVector8(_mm256_min_ps(_mm256_max_ps(m, a), b));
+	}
+
+	__forceinline GSVector8 sat(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_min_ps(_mm256_max_ps(m, a.xyxy()), a.zwzw()));
+	}
+
+	__forceinline GSVector8 sat(const float scale = 255) const
+	{
+		return sat(zero(), GSVector8(scale));
+	}
+
+	__forceinline GSVector8 clamp(const float scale = 255) const
+	{
+		return min(GSVector8(scale));
+	}
+
+	__forceinline GSVector8 min(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_min_ps(m, a));
+	}
+
+	__forceinline GSVector8 max(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_max_ps(m, a));
+	}
+
+	template<int mask> __forceinline GSVector8 blend32(const GSVector8& a)  const
+	{
+		return GSVector8(_mm256_blend_ps(m, a, mask));
+	}
+
+	__forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask)  const
+	{
+		return GSVector8(_mm256_blendv_ps(m, a, mask));
+	}
+
+	__forceinline GSVector8 upl(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_unpacklo_ps(m, a));
+	}
+
+	__forceinline GSVector8 uph(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_unpackhi_ps(m, a));
+	}
+
+	__forceinline GSVector8 upl64(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(m), _mm256_castps_pd(a))));
+	}
+
+	__forceinline GSVector8 uph64(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(m), _mm256_castps_pd(a))));
+	}
+
+	__forceinline GSVector8 l2h() const
+	{
+		return xyxy();
+	}
+
+	__forceinline GSVector8 h2l() const
+	{
+		return zwzw();
+	}
+
+	__forceinline GSVector8 andnot(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_andnot_ps(v.m, m));
+	}
+
+	__forceinline int mask() const
+	{
+		return _mm256_movemask_ps(m);
+	}
+
+	__forceinline bool alltrue() const
+	{
+		return mask() == 0xff;
+	}
+
+	__forceinline bool allfalse() const
+	{
+		return _mm256_testz_ps(m, m) != 0;
+	}
+	
+	__forceinline GSVector8 replace_nan(const GSVector8& v) const
+	{
+		return v.blend32(*this, *this == *this);
+	}
+
+	// Returns a copy of this vector in which element `dst` of each 128-bit lane
+	// is replaced by element `src` of the matching lane of `v`; all other
+	// elements are kept. Both indices address a position inside a lane (0..3),
+	// so the same substitution is applied to the lower and to the upper lane.
+	template<int src, int dst> __forceinline GSVector8 insert32(const GSVector8& v) const
+	{
+		// TODO: use blendps when src == dst
+
+		// src and dst are compile-time constants, so bad indices are rejected at compile time.
+		static_assert(src >= 0 && src < 4 && dst >= 0 && dst < 4, "insert32: src and dst must be in 0..3"); // not cross lane like extract32()
+
+		switch(dst)
+		{
+		case 0:
+			// tmp = (this.y, this.y, v[src], v[src]); tmp.zxzw(*this) = (v[src], this.y, this.z, this.w)
+			switch(src)
+			{
+			case 0: return yyxx(v).zxzw(*this);
+			case 1: return yyyy(v).zxzw(*this);
+			case 2: return yyzz(v).zxzw(*this);
+			case 3: return yyww(v).zxzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 1:
+			// tmp = (this.x, this.x, v[src], v[src]); tmp.xzzw(*this) = (this.x, v[src], this.z, this.w)
+			switch(src)
+			{
+			case 0: return xxxx(v).xzzw(*this);
+			case 1: return xxyy(v).xzzw(*this);
+			case 2: return xxzz(v).xzzw(*this);
+			case 3: return xxww(v).xzzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 2:
+			// tmp = (this.w, this.w, v[src], v[src]); xyzx(tmp) = (this.x, this.y, v[src], this.w)
+			switch(src)
+			{
+			case 0: return xyzx(wwxx(v));
+			case 1: return xyzx(wwyy(v));
+			case 2: return xyzx(wwzz(v));
+			case 3: return xyzx(wwww(v));
+			default: __assume(0);
+			}
+			break;
+		case 3:
+			// tmp = (this.z, this.z, v[src], v[src]); xyxz(tmp) = (this.x, this.y, this.z, v[src])
+			switch(src)
+			{
+			case 0: return xyxz(zzxx(v));
+			case 1: return xyxz(zzyy(v));
+			case 2: return xyxz(zzzz(v));
+			case 3: return xyxz(zzww(v));
+			default: __assume(0);
+			}
+			break;
+		default:
+			__assume(0);
+		}
+
+		// Not reached for valid indices; keeps compilers without __assume support quiet.
+		return *this;
+	}
+
+	// Returns element `i` (0..7) of the 256-bit vector as an int. The 128-bit
+	// half i / 4 is extracted first, then element i & 3 of that half is read
+	// through GSVector4::extract32.
+	template<int i> __forceinline int extract32() const
+	{
+		// i is a compile-time constant, so a bad index is rejected at compile time.
+		static_assert(i >= 0 && i < 8, "extract32: i must be in 0..7");
+
+		return extract<i / 4>().template extract32<i & 3>();
+	}
+
+	// Returns a copy of this vector whose 128-bit half `i` (0 = low, 1 = high)
+	// is replaced by `m`; the other half is left unchanged.
+	template<int i> __forceinline GSVector8 insert(__m128 m) const
+	{
+		// i is a compile-time constant, so a bad index is rejected at compile time.
+		static_assert(i >= 0 && i < 2, "insert: i must be 0 or 1");
+
+		// The parameter shadows the data member, hence the explicit this->m.
+		return GSVector8(_mm256_insertf128_ps(this->m, m, i));
+	}
+
+	// Returns 128-bit half `i` (0 = low, 1 = high) of this vector as a GSVector4.
+	template<int i> __forceinline GSVector4 extract() const
+	{
+		// i is a compile-time constant, so a bad index is rejected at compile time.
+		static_assert(i >= 0 && i < 2, "extract: i must be 0 or 1");
+
+		// The low half is just a reinterpretation of the register; no extract is needed.
+		if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
+
+		return GSVector4(_mm256_extractf128_ps(m, i));
+	}
+
+	// A vector with all 256 bits cleared.
+	__forceinline static GSVector8 zero()
+	{
+		const __m256 cleared = _mm256_setzero_ps();
+
+		return GSVector8(cleared);
+	}
+
+	// A vector with all 256 bits set, obtained by comparing zero with zero
+	// (an equality that holds in every lane).
+	__forceinline static GSVector8 xffffffff()
+	{
+		const GSVector8 zeros = zero();
+
+		return zeros == zeros;
+	}
+
+	// TODO
+
+	// Loads 16 bytes from `p` into the low 128 bits of the result. `p` must be
+	// 16-byte aligned (_mm_load_ps). The high 128 bits are left undefined by
+	// _mm256_castps128_ps256, so callers must not rely on them.
+	__forceinline static GSVector8 loadl(const void* p)
+	{
+		const __m128 low_half = _mm_load_ps(static_cast<const float*>(p));
+
+		return GSVector8(_mm256_castps128_ps256(low_half));
+	}
+
+	// Loads 16 bytes from `p` into the high 128 bits of the result and zeroes
+	// the low 128 bits. `p` must be 16-byte aligned (_mm_load_ps).
+	__forceinline static GSVector8 loadh(const void* p)
+	{
+		const __m128 high_half = _mm_load_ps(static_cast<const float*>(p));
+
+		return zero().insert<1>(high_half);
+	}
+
+	// Returns `v` with its high 128 bits replaced by the 16 bytes at `p`; the
+	// low 128 bits of `v` are kept. `p` must be 16-byte aligned (_mm_load_ps).
+	__forceinline static GSVector8 loadh(const void* p, const GSVector8& v)
+	{
+		const __m128 high_half = _mm_load_ps(static_cast<const float*>(p));
+
+		return GSVector8(_mm256_insertf128_ps(v, high_half, 1));
+	}
+
+	// Builds a vector from two separate 16-byte blocks: `pl` fills the low
+	// 128 bits and `ph` the high 128 bits.
+	__forceinline static GSVector8 load(const void* pl, const void* ph)
+	{
+		const GSVector8 low_only = loadl(pl);
+
+		return loadh(ph, low_only);
+	}
+
+	// Loads 32 bytes from `p`. With aligned == true the aligned load is used
+	// and `p` must be 32-byte aligned; otherwise any address is accepted.
+	template<bool aligned> __forceinline static GSVector8 load(const void* p)
+	{
+		const float* src = static_cast<const float*>(p);
+
+		if(aligned)
+		{
+			return GSVector8(_mm256_load_ps(src));
+		}
+
+		return GSVector8(_mm256_loadu_ps(src));
+	}
+
+	// TODO
+
+	// Stores the low 128 bits of `v` to `p`, which must be 16-byte aligned
+	// (_mm_store_ps).
+	__forceinline static void storel(void* p, const GSVector8& v)
+	{
+		// Taking the low half is a plain reinterpretation of the register.
+		const __m128 low_half = _mm256_castps256_ps128(v.m);
+
+		_mm_store_ps(static_cast<float*>(p), low_half);
+	}
+
+	// Stores the high 128 bits of `v` to `p`, which must be 16-byte aligned
+	// (_mm_store_ps).
+	__forceinline static void storeh(void* p, const GSVector8& v)
+	{
+		const __m128 high_half = _mm256_extractf128_ps(v.m, 1);
+
+		_mm_store_ps(static_cast<float*>(p), high_half);
+	}
+
+	// Stores all 32 bytes of `v` to `p`. With aligned == true the aligned store
+	// is used and `p` must be 32-byte aligned; otherwise any address is accepted.
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
+	{
+		float* out = static_cast<float*>(p);
+
+		if(aligned)
+		{
+			_mm256_store_ps(out, v.m);
+		}
+		else
+		{
+			_mm256_storeu_ps(out, v.m);
+		}
+	}
+
+	//
+
+	// Thin wrapper over _mm256_zeroupper(), which clears the upper 128 bits of
+	// every YMM register. It does not touch any GSVector8 object directly.
+	__forceinline static void zeroupper()
+	{
+		_mm256_zeroupper();
+	}
+
+	// Thin wrapper over _mm256_zeroall(), which clears every YMM register
+	// completely. It does not touch any GSVector8 object directly.
+	__forceinline static void zeroall()
+	{
+		_mm256_zeroall();
+	}
+
+	//
+
+	// Unary minus; simply forwards to neg().
+	__forceinline GSVector8 operator - () const
+	{
+		return this->neg();
+	}
+
+	// In-place element-wise addition of `v`.
+	__forceinline void operator += (const GSVector8& v)
+	{
+		const __m256 sum = _mm256_add_ps(m, v);
+
+		m = sum;
+	}
+
+	// In-place element-wise subtraction of `v`.
+	__forceinline void operator -= (const GSVector8& v)
+	{
+		const __m256 difference = _mm256_sub_ps(m, v);
+
+		m = difference;
+	}
+
+	// In-place element-wise multiplication by `v`.
+	__forceinline void operator *= (const GSVector8& v)
+	{
+		const __m256 product = _mm256_mul_ps(m, v);
+
+		m = product;
+	}
+
+	// In-place element-wise division by `v`.
+	__forceinline void operator /= (const GSVector8& v)
+	{
+		const __m256 quotient = _mm256_div_ps(m, v);
+
+		m = quotient;
+	}
+
+	// Scalar forms of the compound operators: `f` is first wrapped in a
+	// GSVector8 (presumably a broadcast to every lane - confirm against the
+	// float constructor) and the vector overload is then applied.
+	__forceinline void operator += (float f)
+	{
+		const GSVector8 rhs(f);
+
+		*this += rhs;
+	}
+
+	// In-place subtraction of the scalar `f`; see operator += (float).
+	__forceinline void operator -= (float f)
+	{
+		const GSVector8 rhs(f);
+
+		*this -= rhs;
+	}
+
+	// In-place multiplication by the scalar `f`; see operator += (float).
+	__forceinline void operator *= (float f)
+	{
+		const GSVector8 rhs(f);
+
+		*this *= rhs;
+	}
+
+	// In-place division by the scalar `f`; see operator += (float).
+	__forceinline void operator /= (float f)
+	{
+		const GSVector8 rhs(f);
+
+		*this /= rhs;
+	}
+
+	// In-place bitwise AND with `v` (operates on the raw 256 bits).
+	__forceinline void operator &= (const GSVector8& v)
+	{
+		const __m256 bits = _mm256_and_ps(m, v);
+
+		m = bits;
+	}
+
+	// In-place bitwise OR with `v` (operates on the raw 256 bits).
+	__forceinline void operator |= (const GSVector8& v)
+	{
+		const __m256 bits = _mm256_or_ps(m, v);
+
+		m = bits;
+	}
+
+	// In-place bitwise XOR with `v` (operates on the raw 256 bits).
+	__forceinline void operator ^= (const GSVector8& v)
+	{
+		const __m256 bits = _mm256_xor_ps(m, v);
+
+		m = bits;
+	}
+
+	// Element-wise v1 + v2.
+	__forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 sum = _mm256_add_ps(v1, v2);
+
+		return GSVector8(sum);
+	}
+
+	// Element-wise v1 - v2.
+	__forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 difference = _mm256_sub_ps(v1, v2);
+
+		return GSVector8(difference);
+	}
+
+	// Element-wise v1 * v2.
+	__forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 product = _mm256_mul_ps(v1, v2);
+
+		return GSVector8(product);
+	}
+
+	// Element-wise v1 / v2.
+	__forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 quotient = _mm256_div_ps(v1, v2);
+
+		return GSVector8(quotient);
+	}
+
+	// Vector-with-scalar arithmetic: `f` is first wrapped in a GSVector8
+	// (presumably a broadcast to every lane - confirm against the float
+	// constructor) and the vector/vector operator is then applied.
+	__forceinline friend GSVector8 operator + (const GSVector8& v, float f)
+	{
+		const GSVector8 rhs(f);
+
+		return v + rhs;
+	}
+
+	// v - f; see operator + (const GSVector8&, float).
+	__forceinline friend GSVector8 operator - (const GSVector8& v, float f)
+	{
+		const GSVector8 rhs(f);
+
+		return v - rhs;
+	}
+
+	// v * f; see operator + (const GSVector8&, float).
+	__forceinline friend GSVector8 operator * (const GSVector8& v, float f)
+	{
+		const GSVector8 rhs(f);
+
+		return v * rhs;
+	}
+
+	// v / f; see operator + (const GSVector8&, float).
+	__forceinline friend GSVector8 operator / (const GSVector8& v, float f)
+	{
+		const GSVector8 rhs(f);
+
+		return v / rhs;
+	}
+
+	// Bitwise AND of the raw 256 bits of v1 and v2.
+	__forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 bits = _mm256_and_ps(v1, v2);
+
+		return GSVector8(bits);
+	}
+
+	// Bitwise OR of the raw 256 bits of v1 and v2.
+	__forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 bits = _mm256_or_ps(v1, v2);
+
+		return GSVector8(bits);
+	}
+
+	// Bitwise XOR of the raw 256 bits of v1 and v2.
+	__forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 bits = _mm256_xor_ps(v1, v2);
+
+		return GSVector8(bits);
+	}
+
+	// Lane-wise comparisons. Each lane of the result is all ones when the
+	// predicate holds and all zeros otherwise. Every operator uses an ordered,
+	// quiet (_OQ) predicate, so a lane with a NaN operand always compares false.
+	// NOTE(review): this includes operator !=, which therefore disagrees with
+	// the scalar C++ `!=` for NaN operands - confirm this is intended.
+	__forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_EQ_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// Lane mask of v1 != v2 (ordered: false when either operand is NaN).
+	__forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_NEQ_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// Lane mask of v1 > v2.
+	__forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_GT_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// Lane mask of v1 < v2.
+	__forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_LT_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// Lane mask of v1 >= v2.
+	__forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_GE_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// Lane mask of v1 <= v2.
+	__forceinline friend GSVector8 operator <= (const GSVector8& v1, const GSVector8& v2)
+	{
+		const __m256 lanes = _mm256_cmp_ps(v1, v2, _CMP_LE_OQ);
+
+		return GSVector8(lanes);
+	}
+
+	// x = v[31:0] / v[159:128]
+	// y = v[63:32] / v[191:160]
+	// z = v[95:64] / v[223:192]
+	// w = v[127:96] / v[255:224]
+
+
+	// Generates the 256 swizzle methods xxxx() .. wwww(). Each name is built by
+	// pasting four component letters, and each component carries its index
+	// (x = 0, y = 1, z = 2, w = 3) for _MM_SHUFFLE. Every name gets two overloads:
+	//  - name():  all four result elements are picked from this vector;
+	//  - name(v): the first two result elements are picked from this vector and
+	//             the last two from v (the _mm256_shuffle_ps operand order).
+	// The shuffle is applied independently to each 128-bit lane.
+	// VECTOR8_SHUFFLE_1/2/3 fix one, two and three leading components and expand
+	// into the next level for each remaining choice.
+	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+
+		// The single-operand form uses _mm256_shuffle_ps(m, m, ...) rather than _mm256_permute_ps because vs2012u3 cannot reuse the result of equivalent shuffles done with _mm256_permute_ps (write v.xxxx() twice and it is computed twice), whereas with _mm256_shuffle_ps it can.
+		//__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+
+	#define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_1(xs, xn) \
+		VECTOR8_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR8_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR8_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR8_SHUFFLE_2(xs, xn, w, 3) \
+
+	// Instantiate every combination, one leading component at a time.
+	VECTOR8_SHUFFLE_1(x, 0)
+	VECTOR8_SHUFFLE_1(y, 1)
+	VECTOR8_SHUFFLE_1(z, 2)
+	VECTOR8_SHUFFLE_1(w, 3)
+
+	// a = v0[127:0]
+	// b = v0[255:128]
+	// c = v1[127:0]
+	// d = v1[255:128]
+	// _ = 0
+
+	// Generates the 128-bit half permutes. A method name is two selector
+	// characters: the first picks the source of the result's low half, the
+	// second the source of its high half. Selector values go straight into the
+	// _mm256_permute2f128_ps control byte (low nibble = low half, high nibble =
+	// high half); value 8 sets the "zero this half" bit.
+	// Every name gets two overloads:
+	//  - name():  both intrinsic operands are this vector, so c/d select the
+	//             same halves as a/b;
+	//  - name(v): a/b select from this vector, c/d from v.
+	// NOTE(review): the `_`/`_` pairing expands to a method named `__`;
+	// identifiers containing a double underscore are reserved in C++.
+	#define VECTOR8_PERMUTE128_2(as, an, bs, bn) \
+		__forceinline GSVector8 as##bs() const {return GSVector8(_mm256_permute2f128_ps(m, m, an | (bn << 4)));} \
+		__forceinline GSVector8 as##bs(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, an | (bn << 4)));} \
+
+	#define VECTOR8_PERMUTE128_1(as, an) \
+		VECTOR8_PERMUTE128_2(as, an, a, 0) \
+		VECTOR8_PERMUTE128_2(as, an, b, 1) \
+		VECTOR8_PERMUTE128_2(as, an, c, 2) \
+		VECTOR8_PERMUTE128_2(as, an, d, 3) \
+		VECTOR8_PERMUTE128_2(as, an, _, 8) \
+
+	// Instantiate all 25 selector pairs.
+	VECTOR8_PERMUTE128_1(a, 0)
+	VECTOR8_PERMUTE128_1(b, 1)
+	VECTOR8_PERMUTE128_1(c, 2)
+	VECTOR8_PERMUTE128_1(d, 3)
+	VECTOR8_PERMUTE128_1(_, 8)
+
+	#if _M_SSE >= 0x501
+
+	// a = v[63:0]
+	// b = v[127:64]
+	// c = v[191:128]
+	// d = v[255:192]
+
+	// Generates the 256 cross-lane 64-bit permutes aaaa() .. dddd(). Each of the
+	// four letters names the 64-bit source chunk (a = 0 .. d = 3) for the
+	// corresponding 64-bit chunk of the result, lowest chunk first. The data is
+	// reinterpreted as doubles only for _mm256_permute4x64_pd; no value
+	// conversion occurs. VECTOR8_PERMUTE64_1/2/3 fix one, two and three leading
+	// letters and expand into the next level for each remaining choice.
+	#define VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \
+		__forceinline GSVector8 as##bs##cs##ds() const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \
+
+	#define VECTOR8_PERMUTE64_3(as, an, bs, bn, cs, cn) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \
+
+	#define VECTOR8_PERMUTE64_2(as, an, bs, bn) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, a, 0) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, b, 1) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, c, 2) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, d, 3) \
+
+	#define VECTOR8_PERMUTE64_1(as, an) \
+		VECTOR8_PERMUTE64_2(as, an, a, 0) \
+		VECTOR8_PERMUTE64_2(as, an, b, 1) \
+		VECTOR8_PERMUTE64_2(as, an, c, 2) \
+		VECTOR8_PERMUTE64_2(as, an, d, 3) \
+
+	// Instantiate every combination, one leading letter at a time.
+	VECTOR8_PERMUTE64_1(a, 0)
+	VECTOR8_PERMUTE64_1(b, 1)
+	VECTOR8_PERMUTE64_1(c, 2)
+	VECTOR8_PERMUTE64_1(d, 3)
+
+	// Cross-lane permute: each result element is the element of this vector
+	// selected by the corresponding 32-bit index in `mask`.
+	__forceinline GSVector8 permute32(const GSVector8i& mask) const
+	{
+		const __m256 permuted = _mm256_permutevar8x32_ps(m, mask);
+
+		return GSVector8(permuted);
+	}
+
+	// Returns a vector whose eight lanes all hold element 0 of this vector.
+	__forceinline GSVector8 broadcast32() const
+	{
+		const __m128 low_half = _mm256_castps256_ps128(m);
+		const __m256 splat = _mm256_broadcastss_ps(low_half);
+
+		return GSVector8(splat);
+	}
+
+	// Returns a vector whose eight lanes all hold element 0 of `v`.
+	__forceinline static GSVector8 broadcast32(const GSVector4& v)
+	{
+		const __m256 splat = _mm256_broadcastss_ps(v.m);
+
+		return GSVector8(splat);
+	}
+
+	// Reads a single float from `f` and returns a vector whose eight lanes all
+	// hold that value.
+	__forceinline static GSVector8 broadcast32(const void* f)
+	{
+		const __m128 scalar = _mm_load_ss(static_cast<const float*>(f));
+		const __m256 splat = _mm256_broadcastss_ps(scalar);
+
+		return GSVector8(splat);
+	}
+
+	// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
+
+	#endif
+};
+
+#endif
--- a/plugins/GSdx/GSVector8i.h
+++ b/plugins/GSdx/GSVector8i.h
--- a/plugins/GSdx/GSdx.vcxproj
+++ b/plugins/GSdx/GSdx.vcxproj
@ -250,6 +250,10 @@
    <ClInclude Include="GSUniformBufferOGL.h" />
    <ClInclude Include="GSUtil.h" />
    <ClInclude Include="GSVector.h" />
+    <ClInclude Include="GSVector4i.h" />
+    <ClInclude Include="GSVector4.h" />
+    <ClInclude Include="GSVector8i.h" />
+    <ClInclude Include="GSVector8.h" />
    <ClInclude Include="GSVertex.h" />
    <ClInclude Include="GSVertexArrayOGL.h" />
    <ClInclude Include="GSVertexHW.h" />
--- a/plugins/GSdx/GSdx.vcxproj.filters
+++ b/plugins/GSdx/GSdx.vcxproj.filters
@ -467,6 +467,18 @@
    <ClInclude Include="GSVector.h">
      <Filter>Header Files</Filter>
    </ClInclude>
+    <ClInclude Include="GSVector4i.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVector4.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVector8i.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVector8.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
    <ClInclude Include="GSVertex.h">
      <Filter>Header Files</Filter>
    </ClInclude>