GSdx: More avx2 code to read/write different block formats, the GSBenchmark function shows nice improvements, but no games run faster. I just upload the changes before messing with the drawing part.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5675 96395faa-99c1-11dd-bbfe-3dabce05a288
2013-06-17 04:11:10 +00:00 · 2013-06-17 04:11:10 +00:00 · 8b9f5b5bc2
parent f4ce6d6fce
commit 8b9f5b5bc2
20 changed files with 1467 additions and 596 deletions
--- a/plugins/GSdx/GPUDrawScanline.h
+++ b/plugins/GSdx/GPUDrawScanline.h
@ -71,4 +71,6 @@ public:
 	void DrawRect(const GSVector4i& r, const GSVertexSW& v);

 #endif
+
+	void PrintStats() {m_ds_map.PrintStats();}
 };
--- a/plugins/GSdx/GPURenderer.h
+++ b/plugins/GSdx/GPURenderer.h
@ -120,7 +120,7 @@ protected:
 	void GrowVertexBuffer()
 	{
 		int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
-		Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 16);
+		Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);

 	    if(m_vertices != NULL)
 		{
--- a/plugins/GSdx/GPURendererSW.cpp
+++ b/plugins/GSdx/GPURendererSW.cpp
@ -27,7 +27,7 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
 	: GPURendererT<GSVertexSW>(dev)
 	, m_texture(NULL)
 {
-	m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
+	m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 32);

 	m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads, &m_perfmon);
 }
@ -121,7 +121,7 @@ void GPURendererSW::Draw()
 	data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
 	data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
 	
-	data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16);
+	data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 32);
 	data->vertex = (GSVertexSW*)data->buff;
 	data->vertex_count = m_count;

--- a/plugins/GSdx/GPUState.cpp
+++ b/plugins/GSdx/GPUState.cpp
@ -747,7 +747,7 @@ GPUState::Buffer::Buffer()
 {
 	bytes = 0;
 	maxbytes = 4096;
-	buff = (uint8*)_aligned_malloc(maxbytes, 16);
+	buff = (uint8*)_aligned_malloc(maxbytes, 32);
 	cur = 0;
 }

@ -761,9 +761,9 @@ void GPUState::Buffer::Reserve(int size)
 	if(size > maxbytes)
 	{
 		int new_maxbytes = (maxbytes + size + 1023) & ~1023;
-		uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 16);
+		uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 32);

-	    if(buff != NULL)
+		if(buff != NULL)
 		{
 			memcpy(new_buff, buff, maxbytes);
 			_aligned_free(buff);
--- a/plugins/GSdx/GSBlock.cpp
+++ b/plugins/GSdx/GSBlock.cpp
@ -23,9 +23,7 @@
 #include "GSBlock.h"

 #if _M_SSE >= 0x501
-const GSVector8i GSBlock::m_r16mask(
-	0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 
-	0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
+const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
 #else
 const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
 #endif
@ -44,7 +42,7 @@ const GSVector4i GSBlock::m_xgxx(0x000003e0);
 const GSVector4i GSBlock::m_rxxx(0x0000001f);
 #endif

-const GSVector4i GSBlock::m_uw8hmask0 = GSVector4i(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
-const GSVector4i GSBlock::m_uw8hmask1 = GSVector4i(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
-const GSVector4i GSBlock::m_uw8hmask2 = GSVector4i(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
-const GSVector4i GSBlock::m_uw8hmask3 = GSVector4i(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
+const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
+const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
+const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
+const GSVector4i GSBlock::m_uw8hmask3(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
--- a/plugins/GSdx/GSBlock.h
+++ b/plugins/GSdx/GSBlock.h
--- a/plugins/GSdx/GSCapture.cpp
+++ b/plugins/GSdx/GSCapture.cpp
@ -177,7 +177,7 @@ public:
 		: CBaseFilter(NAME("GSSource"), pUnk, this, __uuidof(this), &hr)
 		, m_output(NULL)
 		, m_size(w, h)
-		, m_atpf(10000000i64 / fps)
+		, m_atpf((REFERENCE_TIME)(10000000.0f / fps))
 		, m_now(0)
 	{
 		m_output = new GSSourceOutputPin(m_size, m_atpf, this, this, hr, colorspace);
--- a/plugins/GSdx/GSDrawScanline.h
+++ b/plugins/GSdx/GSDrawScanline.h
@ -85,4 +85,6 @@ public:
 	void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);

 #endif
+
+	void PrintStats() {m_ds_map.PrintStats();}
 };
--- a/plugins/GSdx/GSDrawingContext.h
+++ b/plugins/GSdx/GSDrawingContext.h
@ -44,8 +44,9 @@ public:
 	struct
 	{
 		GSVector4 in;
+		GSVector4i ex;
 		GSVector4 ofex;
-		uint32 ofxy;
+		GSVector4i ofxy;
 	} scissor;

 	struct
@ -83,6 +84,11 @@ public:

 	void UpdateScissor()
 	{
+		scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4);
+		scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4);
+		scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4);
+		scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4);
+
 		scissor.ofex = GSVector4(
 			(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
 			(int)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY),
@ -95,10 +101,11 @@ public:
 			(int)SCISSOR.SCAX1 + 1,
 			(int)SCISSOR.SCAY1 + 1);

-		uint16 ofx = (uint16)XYOFFSET.OFX - 15;
-		uint16 ofy = (uint16)XYOFFSET.OFY - 15;
+		uint16 ofx = (uint16)XYOFFSET.OFX;
+		uint16 ofy = (uint16)XYOFFSET.OFY;

-		scissor.ofxy = ((ofy << 16) | ofx); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
+		scissor.ofxy.u32[0] = (ofy << 16) | ofx;
+		scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
 	}

 	bool DepthRead() const
--- a/plugins/GSdx/GSLocalMemory.cpp
+++ b/plugins/GSdx/GSLocalMemory.cpp
@ -678,7 +678,7 @@ vector<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)

 ////////////////////

-template<int psm, int bsx, int bsy, bool aligned>
+template<int psm, int bsx, int bsy, int alignment>
 void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
 {
 	uint32 bp = BITBLTBUF.DBP;
@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteColumn16<aligned>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteColumn16<aligned>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteColumn8<aligned>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteColumn4<aligned>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteColumn16<aligned>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteColumn16<aligned>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@ -707,7 +707,7 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
 	}
 }

-template<int psm, int bsx, int bsy, bool aligned>
+template<int psm, int bsx, int bsy, int alignment>
 void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
 {
 	uint32 bp = BITBLTBUF.DBP;
@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src
 		{
 			switch(psm)
 			{
-			case PSM_PSMCT32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMCT16: WriteBlock16<aligned>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMCT16S: WriteBlock16<aligned>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMT8: WriteBlock8<aligned>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
-			case PSM_PSMT4: WriteBlock4<aligned>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
-			case PSM_PSMZ32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
-			case PSM_PSMZ16: WriteBlock16<aligned>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
-			case PSM_PSMZ16S: WriteBlock16<aligned>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMCT16: WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMCT16S: WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMT8: WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
+			case PSM_PSMT4: WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
+			case PSM_PSMZ32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
+			case PSM_PSMZ16: WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
+			case PSM_PSMZ16S: WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
 			// TODO
 			default: __assume(0);
 			}
@ -803,7 +803,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			case PSM_PSMZ32:
 				ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 4], 32);
-				WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
+				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
@ -811,17 +811,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			case PSM_PSMZ16S:
 				ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[32], &src[x * 2], 32);
-				WriteColumn16<true>(y, dst, buff, 32);
+				WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
 				ReadColumn8(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<true>(y, dst, buff, 16);
+				WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
 				ReadColumn4(y, dst, buff, 16);
 				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<true>(y, dst, buff, 16);
+				WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@ -841,13 +841,19 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*

 		if(h2 > 0)
 		{
-			if(((size_t)&src[l * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
+			size_t addr = (size_t)&src[l * trbpp >> 3];
+
+			if((addr & 31) == 0 && (srcpitch & 31) == 0)
 			{
-				WriteImageColumn<psm, bsx, bsy, true>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+				WriteImageColumn<psm, bsx, bsy, 32>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+			}
+			else if((addr & 15) == 0 && (srcpitch & 15) == 0)
+			{
+				WriteImageColumn<psm, bsx, bsy, 16>(l, r, y, h2, src, srcpitch, BITBLTBUF);
 			}
 			else
 			{
-				WriteImageColumn<psm, bsx, bsy, false>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+				WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
 			}

 			src += srcpitch * h2;
@ -884,7 +890,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			case PSM_PSMZ32:
 				ReadColumn32(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 4], 32);
-				WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
+				WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
 				break;
 			case PSM_PSMCT16:
 			case PSM_PSMCT16S:
@ -892,17 +898,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
 			case PSM_PSMZ16S:
 				ReadColumn16(y, dst, buff, 32);
 				memcpy(&buff[0], &src[x * 2], 32);
-				WriteColumn16<true>(y, dst, buff, 32);
+				WriteColumn16<32>(y, dst, buff, 32);
 				break;
 			case PSM_PSMT8:
 				ReadColumn8(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16);
-				WriteColumn8<true>(y, dst, buff, 16);
+				WriteColumn8<32>(y, dst, buff, 16);
 				break;
 			case PSM_PSMT4:
 				ReadColumn4(y, dst, buff, 16);
 				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16);
-				WriteColumn4<true>(y, dst, buff, 16);
+				WriteColumn4<32>(y, dst, buff, 16);
 				break;
 			// TODO
 			default:
@ -982,13 +988,19 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const uint8* src, int len, GIFR

 				if(h2 > 0)
 				{
-					if(((size_t)&s[la * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
+					size_t addr = (size_t)&s[la * trbpp >> 3];
+
+					if((addr & 31) == 0 && (srcpitch & 31) == 0)
 					{
-						WriteImageBlock<psm, bsx, bsy, true>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+						WriteImageBlock<psm, bsx, bsy, 32>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+					}
+					else if((addr & 15) == 0 && (srcpitch & 15) == 0)
+					{
+						WriteImageBlock<psm, bsx, bsy, 16>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
 					}
 					else
 					{
-						WriteImageBlock<psm, bsx, bsy, false>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+						WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
 					}

 					s += srcpitch * h2;
@ -1385,6 +1397,8 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
 	int sx = (int)TRXPOS.SSAX;
 	int ex = sx + (int)TRXREG.RRW;

+	// printf("spsm=%d x=%d ex=%d y=%d len=%d\n", BITBLTBUF.SPSM, x, ex, y, len);
+
 	switch(BITBLTBUF.SPSM)
 	{
 	case PSM_PSMCT32:
@ -1399,12 +1413,21 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
 			int* RESTRICT offset = psm->rowOffset[y & 7];
 			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];

-			for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pd += 4)
+			for(; len > 0 && x < ex && (x & 7); len--, x++, pd++) 
 			{
-				pd[0] = ps[offset[x + 0]];
-				pd[1] = ps[offset[x + 1]];
-				pd[2] = ps[offset[x + 2]];
-				pd[3] = ps[offset[x + 3]];
+				*pd = ps[offset[x]];
+			}
+
+			// aligned to a column
+
+			for(int ex8 = ex - 8; len >= 8 && x <= ex8; len -= 8, x += 8, pd += 8)
+			{
+				int o = offset[x];
+
+				GSVector4i::store<false>(&pd[0], GSVector4i::load(&ps[o + 0], &ps[o + 4]));
+				GSVector4i::store<false>(&pd[4], GSVector4i::load(&ps[o + 8], &ps[o + 12]));
+
+				for(int i = 0; i < 8; i++) ASSERT(pd[i] == ps[offset[x + i]]);
 			}

 			for(; len > 0 && x < ex; len--, x++, pd++)
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@ -845,10 +845,10 @@ public:

 	//

-	template<int psm, int bsx, int bsy, bool aligned>
+	template<int psm, int bsx, int bsy, int alignment>
 	void WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);

-	template<int psm, int bsx, int bsy, bool aligned>
+	template<int psm, int bsx, int bsy, int alignment>
 	void WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);

 	template<int psm, int bsx, int bsy>
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@ -467,7 +467,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
 		{
 			edge = vertex[i[1 - m2]];

-			edge.p = edge.p.insert<0, 1>(vertex[i[m2]].p);
+			edge.p = edge.p.insert32<0, 1>(vertex[i[m2]].p);
 			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);

 			DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, vertex[i[1 - m2]].p);
@ -489,7 +489,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
 		{
 			edge = v1;

-			edge.p = v0.p.xxxx().addm(ddx[m2], dv[0].p.yyyy()).xyzw(edge.p);
+			edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p);
 			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);

 			DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p);
@ -532,7 +532,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co

 		GSVertexSW scan;

-		scan.p = edge.p.addm(dedge.p, dy);
+		scan.p = edge.p + dedge.p * dy;

 		GSVector4 lrf = scan.p.ceil();
 		GSVector4 l = lrf.max(scissor);
@ -546,16 +546,18 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co

 		if(pixels > 0)
 		{
-			scan.t = edge.t.addm(dedge.t, dy);
-			scan.c = edge.c.addm(dedge.c, dy);
+			scan.t = edge.t + dedge.t * dy;
+			scan.c = edge.c + dedge.c * dy;

 			GSVector4 prestep = (l - p0).xxxx();

-			scan.p = scan.p.addm(dscan.p, prestep);
-			scan.t = scan.t.addm(dscan.t, prestep);
-			scan.c = scan.c.addm(dscan.c, prestep);
+			scan.p = scan.p + dscan.p * prestep;
+			scan.t = scan.t + dscan.t * prestep;
+			scan.c = scan.c + dscan.c * prestep;

 			AddScanline(e++, pixels, left, top, scan);
+
+			//m_pixels += pixels; m_ds->DrawScanline(pixels, left, top, scan);
 		}

 		top++;
@ -629,8 +631,8 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index)
 	GSVertexSW dedge;
 	GSVertexSW dscan;

-	dedge.t = GSVector4::zero().insert<1, 1>(dt);
-	dscan.t = GSVector4::zero().insert<0, 0>(dt);
+	dedge.t = GSVector4::zero().insert32<1, 1>(dt);
+	dscan.t = GSVector4::zero().insert32<0, 0>(dt);

 	GSVector4 prestep = GSVector4(r.left, r.top) - scan.p;

@ -851,9 +853,9 @@ void GSRasterizer::AddScanline(GSVertexSW* e, int pixels, int left, int top, con
 {
 	*e = scan;

-	e->p.i16[0] = (int16)pixels;
-	e->p.i16[1] = (int16)left;
-	e->p.i16[2] = (int16)top;
+	e->_pad.i32[0] = pixels;
+	e->_pad.i32[1] = left;
+	e->_pad.i32[2] = top;
 }

 void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge)
@ -873,9 +875,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
 		{
 			do
 			{
-				int pixels = e->p.i16[0];
-				int left = e->p.i16[1];
-				int top = e->p.i16[2];
+				int pixels = e->_pad.i32[0];
+				int left = e->_pad.i32[1];
+				int top = e->_pad.i32[2];

 				m_pixels += pixels;

@ -887,9 +889,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
 		{
 			do
 			{
-				int pixels = e->p.i16[0];
-				int left = e->p.i16[1];
-				int top = e->p.i16[2];
+				int pixels = e->_pad.i32[0];
+				int left = e->_pad.i32[1];
+				int top = e->_pad.i32[2];

 				m_pixels += pixels;

--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@ -104,6 +104,8 @@ public:
 	
 #endif

+	virtual void PrintStats() = 0;
+
 	__forceinline bool HasEdge() const {return m_de != NULL;}
 	__forceinline bool IsSolidRect() const {return m_dr != NULL;}
 };
@ -117,6 +119,7 @@ public:
 	virtual void Sync() = 0;
 	virtual bool IsSynced() const = 0;
 	virtual int GetPixels(bool reset = true) = 0;
+	virtual void PrintStats() = 0;
 };

 __aligned(class, 32) GSRasterizer : public IRasterizer
@ -164,10 +167,10 @@ public:
 	void Sync() {}
 	bool IsSynced() const {return true;}
 	int GetPixels(bool reset);
+	void PrintStats() {m_ds->PrintStats();}
 };

-class GSRasterizerList 
-	: public IRasterizer
+class GSRasterizerList : public IRasterizer
 {
 protected:
 	class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
@ -221,4 +224,5 @@ public:
 	void Sync();
 	bool IsSynced() const;
 	int GetPixels(bool reset);
+	void PrintStats() {}
 };
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@ -28,6 +28,10 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;

 const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);

+#if _M_SSE >= 0x501
+const GSVector8 g_pos_scale2(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
+#endif
+
 GSRendererSW::GSRendererSW(int threads)
 	: m_fzb(NULL)
 {
@ -210,7 +214,7 @@ void GSRendererSW::VSync(int field)

 	m_tc->IncAge();

-	// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
+	// if((m_perfmon.GetFrame() & 255) == 0) m_rl->PrintStats();
 }

 void GSRendererSW::ResetDevice()
@ -263,18 +267,80 @@ GSTexture* GSRendererSW::GetOutput(int i)
 template<uint32 primclass, uint32 tme, uint32 fst>
 void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
 {
-	size_t i = m_vertex.next;
+	#if 0//_M_SSE >= 0x501

+	// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
+
+	GSVector8i o2((GSVector4i)m_context->XYOFFSET);
+	GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
+
+	for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
+	{
+		GSVector8i v0 = GSVector8i::load<true>(src[0].m);
+		GSVector8i v1 = GSVector8i::load<true>(src[1].m);
+
+		GSVector8 stcq = GSVector8::cast(v0.ac(v1));
+		GSVector8i xyzuvf = v0.bd(v1);
+
+		//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
+		//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
+
+		GSVector8i xy = xyzuvf.upl16() - o2;
+		GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
+
+		GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * g_pos_scale2;
+		GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
+
+		GSVector8 t = GSVector8::zero();
+
+		if(tme)
+		{
+			if(fst)
+			{
+				t = GSVector8(xyzuvf.uph16() << (16 - 4));
+			}
+			else
+			{
+				t = stcq.xyww() * tsize2;
+			}
+		}
+
+		if(primclass == GS_SPRITE_CLASS)
+		{
+			t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
+		}
+/*
+		if(tme || primclass == GS_SPRITE_CLASS) 
+		{
+			GSVector8::store<true>(&dst[0].p, p.ac(t));
+		}
+		else 
+		{
+			GSVector8::storel(&dst[0].p, p);
+		}
+*/
+		GSVector8::store<true>(&dst[0].p, p.ac(t));
+		GSVector8::store<true>(&dst[0].c, c.a_());
+/*
+		if(tme || primclass == GS_SPRITE_CLASS) 
+		{
+			GSVector8::store<true>(&dst[1].p, p.bd(t));
+		}
+		else 
+		{
+			GSVector8::storeh(&dst[1].p, p);
+		}
+*/
+		GSVector8::store<true>(&dst[1].p, p.bd(t));
+		GSVector8::store<true>(&dst[1].c, c.b_());
+	}
+
+	#else
+	
 	GSVector4i o = (GSVector4i)m_context->XYOFFSET;
 	GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);

-	#if _M_SSE >= 0x501
-
-	// TODO: process vertices in pairs, when AVX2 becomes available
-
-	#endif
-	
-	for(; i > 0; i--, src++, dst++)
+	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
 	{
 		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q

@ -297,7 +363,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
 		dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);

-		GSVector4 t;
+		GSVector4 t = GSVector4::zero();

 		if(tme)
 		{
@ -323,17 +389,25 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
 		{
 			#if _M_SSE >= 0x401

-			t = t.insert<1, 3>(GSVector4::cast(xyzuvf));
-				
+			t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
+
 			#else
-				
-			t = t.insert<0, 3>(GSVector4::cast(GSVector4i::load(z)));
+
+			t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));

 			#endif
 		}

 		dst->t = t;
+
+		#if _M_SSE >= 0x501
+
+		dst->_pad = GSVector4::zero();
+
+		#endif
 	}
+
+	#endif
 }

 void GSRendererSW::Draw()
@ -345,10 +419,10 @@ void GSRendererSW::Draw()
 	shared_ptr<GSRasterizerData> data(sd);

 	sd->primclass = m_vt.m_primclass;
-	sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32);
+	sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1) + sizeof(uint32) * m_index.tail, 32);
 	sd->vertex = (GSVertexSW*)sd->buff;
 	sd->vertex_count = m_vertex.next;
-	sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next);
+	sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
 	sd->index_count = m_index.tail;

 	(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
@ -631,6 +705,20 @@ void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
 	}
 }

+__forceinline void Increment16(volatile short* lpAddend)
+{
+	// (*lpAddend)++;
+
+	_InterlockedIncrement16(lpAddend);
+}
+
+__forceinline void Decrement16(volatile short* lpAddend)
+{
+	// (*lpAddend)--;
+
+	_InterlockedDecrement16(lpAddend);
+}
+	
 void GSRendererSW::UsePages(const uint32* pages, int type)
 {
 	if(type < 2)
@ -639,7 +727,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
 		{
 			ASSERT(((short*)&m_fzb_pages[*p])[type] < SHRT_MAX);

-			_InterlockedIncrement16((short*)&m_fzb_pages[*p] + type);
+			Increment16((short*)&m_fzb_pages[*p] + type);
 		}
 	}
 	else
@ -648,7 +736,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
 		{
 			ASSERT(m_tex_pages[*p] < SHRT_MAX);

-			_InterlockedIncrement16((short*)&m_tex_pages[*p]); // remember which texture pages are used
+			Increment16((short*)&m_tex_pages[*p]);
 		}
 	}
 }
@ -661,7 +749,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
 		{
 			ASSERT(((short*)&m_fzb_pages[*p])[type] > 0);

-			_InterlockedDecrement16((short*)&m_fzb_pages[*p] + type);
+			Decrement16((short*)&m_fzb_pages[*p] + type);
 		}
 	}
 	else
@ -670,7 +758,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
 		{
 			ASSERT(m_tex_pages[*p] > 0);

-			_InterlockedDecrement16((short*)&m_tex_pages[*p]);
+			Decrement16((short*)&m_tex_pages[*p]);
 		}
 	}
 }
@ -1390,23 +1478,29 @@ GSRendererSW::SharedData::~SharedData()
 	fflush(s_fp);}
 }

+static TransactionScope::Lock s_lock;
+
 void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm)
 {
 	if(m_using_pages) return;

-	if(global.sel.fb)
 	{
-		m_parent->UsePages(fb_pages, 0);
-	}
+		//TransactionScope scope(s_lock);

-	if(global.sel.zb)
-	{
-		m_parent->UsePages(zb_pages, 1);
-	}
+		if(global.sel.fb)
+		{
+			m_parent->UsePages(fb_pages, 0);
+		}

-	for(size_t i = 0; m_tex[i].t != NULL; i++)
-	{
-		m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
+		if(global.sel.zb)
+		{
+			m_parent->UsePages(zb_pages, 1);
+		}
+
+		for(size_t i = 0; m_tex[i].t != NULL; i++)
+		{
+			m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
+		}
 	}

 	m_fb_pages = fb_pages;
@ -1421,19 +1515,23 @@ void GSRendererSW::SharedData::ReleasePages()
 {
 	if(!m_using_pages) return;

-	if(global.sel.fb)
 	{
-		m_parent->ReleasePages(m_fb_pages, 0);
-	}
+		//TransactionScope scope(s_lock);

-	if(global.sel.zb)
-	{
-		m_parent->ReleasePages(m_zb_pages, 1);
-	}
+		if(global.sel.fb)
+		{
+			m_parent->ReleasePages(m_fb_pages, 0);
+		}

-	for(size_t i = 0; m_tex[i].t != NULL; i++)
-	{
-		m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
+		if(global.sel.zb)
+		{
+			m_parent->ReleasePages(m_zb_pages, 1);
+		}
+
+		for(size_t i = 0; m_tex[i].t != NULL; i++)
+		{
+			m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
+		}
 	}

 	delete [] m_fb_pages;
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@ -516,7 +516,7 @@ void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
 	*/
 	GSVector4i xy = GSVector4i::loadl(&r->u64[0]);
 	GSVector4i zf = GSVector4i::loadl(&r->u64[1]);
-	xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
+	xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
 	zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());

 	m_v.m[1] = xy.upl32(zf);
@ -567,14 +567,19 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, ui
 		GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
 		GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
 		GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
-
+		/*
+		GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
+		GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
+		GSVector4i rbga = rg.upl8(ba);
+		GSVector4i rgba = rbga.upl8(rbga.zzzz());
+		*/
 		q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ

 		m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one

 		GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
 		GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
-		xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
+		xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
 		zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());

 		m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
@ -599,7 +604,12 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uin
 		GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
 		GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
 		GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
-
+		/*
+		GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
+		GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
+		GSVector4i rbga = rg.upl8(ba);
+		GSVector4i rgba = rbga.upl8(rbga.zzzz());
+		*/
 		q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ

 		m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
@ -719,7 +729,7 @@ void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)

 	GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
 	GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
-	GSVector4i uvf = GSVector4i::loadl(&m_v.UV).upl32(xyzf.srl32(24).srl<4>());
+	GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
 	
 	m_v.m[1] = xyz.upl64(uvf);

@ -2258,7 +2268,7 @@ void GSState::UpdateContext()

 void GSState::UpdateScissor()
 {
-	m_scissor = m_context->scissor.ofex;
+	m_scissor = m_context->scissor.ex;
 	m_ofxy = m_context->scissor.ofxy;
 }

@ -2286,8 +2296,8 @@ void GSState::GrowVertexBuffer()
 {
 	int maxcount = std::max<int>(m_vertex.maxcount * 3 / 2, 10000);

-	GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 16);
-	uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 16); // worst case is slightly less than vertex number * 3
+	GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
+	uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3

 	if(m_vertex.buff != NULL)
 	{
@ -2328,7 +2338,13 @@ __forceinline void GSState::VertexKick(uint32 skip)
 	tailptr[0] = v0;
 	tailptr[1] = v1;

-	m_vertex.xy[xy_tail & 3] = GSVector4(v1.upl32(v1.sub16(GSVector4i::load(m_ofxy)).sra16(4)).upl16()); // zw not sign extended, only useful for eq tests
+	GSVector4i xy = v1.xxxx().sub16(m_ofxy);
+
+	#if _M_SSE >= 0x401
+	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4)));
+	#else
+	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy()));
+	#endif

 	m_vertex.tail = ++tail;
 	m_vertex.xy_tail = ++xy_tail;
@ -2356,14 +2372,14 @@ __forceinline void GSState::VertexKick(uint32 skip)

 	if(skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
 	{
-		GSVector4 v0, v1, v2, v3;
+		GSVector4i v0, v1, v2, v3, pmin, pmax;

-		v0 = m_vertex.xy[(xy_tail + 1) & 3]; // T-3
-		v1 = m_vertex.xy[(xy_tail + 2) & 3]; // T-2
-		v2 = m_vertex.xy[(xy_tail + 3) & 3]; // T-1
-		v3 = m_vertex.xy[(xy_tail - m) & 3]; // H
+		v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
+		v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
+		v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
+		v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H

-		GSVector4 pmin, pmax, cross;
+		GSVector4 cross;

 		switch(prim)
 		{
@ -2374,21 +2390,21 @@ __forceinline void GSState::VertexKick(uint32 skip)
 		case GS_LINELIST:
 		case GS_LINESTRIP:
 		case GS_SPRITE:
-			pmin = v2.min(v1);
-			pmax = v2.max(v1);
+			pmin = v2.min_i16(v1);
+			pmax = v2.max_i16(v1);
 			break;
 		case GS_TRIANGLELIST:
 		case GS_TRIANGLESTRIP:
-			pmin = v2.min(v1.min(v0));
-			pmax = v2.max(v1.max(v0));
+			pmin = v2.min_i16(v1.min_i16(v0));
+			pmax = v2.max_i16(v1.max_i16(v0));
 			break;
 		case GS_TRIANGLEFAN:
-			pmin = v2.min(v1.min(v3));
-			pmax = v2.max(v1.max(v3));
+			pmin = v2.min_i16(v1.min_i16(v3));
+			pmax = v2.max_i16(v1.max_i16(v3));
 			break;
 		}

-		GSVector4 test = pmax < m_scissor | pmin > m_scissor.zwxy(); 
+		GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl()); 
 		
 		switch(prim)
 		{
@ -2396,7 +2412,7 @@ __forceinline void GSState::VertexKick(uint32 skip)
 		case GS_TRIANGLESTRIP:
 		case GS_TRIANGLEFAN:
 		case GS_SPRITE:
-			test |= m_nativeres ? (pmin == pmax).zwzw() : pmin == pmax;
+			test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
 			break;
 		}

@ -2404,16 +2420,19 @@ __forceinline void GSState::VertexKick(uint32 skip)
 		{
 		case GS_TRIANGLELIST:
 		case GS_TRIANGLESTRIP:
-			cross = (v2 - v1) * (v2 - v0).yxwz();
-			test |= cross == cross.yxwz();
+			// TODO: any way to do a 16-bit integer cross product?
+			cross = GSVector4(v2.xyxyl().i16to32().sub32(v0.upl32(v1).i16to32())); // x20, y20, x21, y21
+			cross = cross * cross.wzwz(); // x20 * y21, y20 * x21
+			test |= GSVector4i::cast(cross == cross.yxwz());
 			break;
 		case GS_TRIANGLEFAN:
-			cross = (v2 - v1) * (v2 - v3).yxwz();
-			test |= cross == cross.yxwz();
+			cross = GSVector4(v2.xyxyl().i16to32().sub32(v3.upl32(v1).i16to32())); // x23, y23, x21, y21
+			cross = cross * cross.wzwz(); // x23 * y21, y23 * x21
+			test |= GSVector4i::cast(cross == cross.yxwz());
 			break;
 		}
 		
-		skip |= test.mask() & 3;
+		skip |= test.mask() & 15;
 	}

 	if(skip != 0)
--- a/plugins/GSdx/GSState.h
+++ b/plugins/GSdx/GSState.h
@ -148,16 +148,16 @@ protected:

 	GSVertex m_v;
 	float m_q;
-	GSVector4 m_scissor;
-	uint32 m_ofxy;
+	GSVector4i m_scissor;
+	GSVector4i m_ofxy;
 	bool m_texflush;
 	
 	struct 
 	{
 		GSVertex* buff; 
 		size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
-		GSVector4 xy[4]; 
 		size_t xy_tail;
+		uint64 xy[4];
 	} m_vertex; 

 	struct 
--- a/plugins/GSdx/GSThread.h
+++ b/plugins/GSdx/GSThread.h
@ -367,3 +367,99 @@ public:

 	virtual void Process(T& item) = 0;
 };
+
+// http://software.intel.com/en-us/blogs/2012/11/06/exploring-intel-transactional-synchronization-extensions-with-intel-software
+
+class TransactionScope
+{
+public:
+	class Lock
+	{
+		volatile long state;
+
+	public:
+		Lock() 
+			: state(0) 
+		{
+		}
+
+		void lock()
+		{
+			while(_InterlockedCompareExchange(&state, 1, 0) != 0)
+			{
+				do {_mm_pause();} while(state == 1);
+			}
+		}
+
+		void unlock() 
+		{
+			_InterlockedExchange(&state, 0);
+		}
+
+		bool isLocked() const 
+		{
+			return state == 1;
+		}
+	};
+
+private:
+	Lock& fallBackLock;
+
+	TransactionScope();
+
+public:
+	TransactionScope(Lock& fallBackLock_, int max_retries = 3) 
+		: fallBackLock(fallBackLock_)
+	{
+		#if _M_SSE >= 0x501
+
+		int nretries = 0;
+		
+		while(1)
+		{
+			++nretries;
+
+			unsigned status = _xbegin();
+
+			if(status == _XBEGIN_STARTED)
+			{
+				if(!fallBackLock.isLocked()) return;
+
+				_xabort(0xff); 
+			}
+
+			if((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff && !(status & _XABORT_NESTED))
+			{
+				while(fallBackLock.isLocked()) _mm_pause();
+			}
+			else if(!(status & _XABORT_RETRY))
+			{
+				break;
+			}
+
+			if(nretries >= max_retries) 
+			{
+				break;
+			}
+		}
+
+		#endif
+
+		fallBackLock.lock();
+	}
+
+	~TransactionScope()
+	{
+		if(fallBackLock.isLocked())
+		{
+			fallBackLock.unlock();
+		}
+		#if _M_SSE >= 0x501
+		else
+		{
+			_xend();
+		}
+		#endif
+	}
+};
+
--- a/plugins/GSdx/GSVector.cpp
+++ b/plugins/GSdx/GSVector.cpp
@ -78,6 +78,8 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000
 const GSVector8 GSVector8::m_one(1.0f);
 const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
 const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
+const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));

 #endif

@ -209,4 +211,4 @@ GSVector4i GSVector4i::fit(int preset) const
 	}

 	return r;
-}
+}
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
@ -159,7 +159,7 @@ public:

 	__forceinline explicit GSVector4i(int i)
 	{
-		m = _mm_set1_epi32(i);
+		*this = i;
 	}

 	__forceinline explicit GSVector4i(__m128i m)
@ -190,7 +190,15 @@ public:

 	__forceinline void operator = (int i)
 	{
+		#if _M_SSE >= 0x501
+
+		m = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		#else
+
 		m = _mm_set1_epi32(i);
+
+		#endif
 	}

 	__forceinline void operator = (__m128i m)
@ -790,6 +798,16 @@ public:
 		return upl32();
 	}

+	__forceinline GSVector4i i8to16() const
+	{
+		return zero().upl8().sra16(8);
+	}
+
+	__forceinline GSVector4i i16to32() const
+	{
+		return zero().upl16().sra32(16);
+	}
+
 	#endif

 	template<int i> __forceinline GSVector4i srl() const
@ -1245,6 +1263,15 @@ public:

 	#if _M_SSE >= 0x401

+	template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
+	{
+		return GSVector4i(_mm_blend_epi32(m, v.m, i));
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x401
+
 	template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
 	{
 		GSVector4i v;
@ -2446,23 +2473,31 @@ public:
 		m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v));
 	}

-	__forceinline explicit GSVector4(float f)
-	{
-		m = _mm_set1_ps(f);
-	}
-
 	__forceinline explicit GSVector4(__m128 m)
 	{
 		this->m = m;
 	}

+	__forceinline explicit GSVector4(float f)
+	{
+		*this = f;
+	}
+
 	__forceinline explicit GSVector4(int i)
 	{
+		#if _M_SSE >= 0x501
+
+		m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
+
+		#else
+
 		GSVector4i v((int)i);

 		*this = GSVector4(v);
-	}

+		#endif
+	}
+	
 	__forceinline explicit GSVector4(uint32 u)
 	{
 		GSVector4i v((int)u);
@ -2493,7 +2528,15 @@ public:

 	__forceinline void operator = (float f)
 	{
+		#if _M_SSE >= 0x501
+
+		m =  _mm_broadcastss_ps(_mm_load_ss(&f));
+
+		#else
+
 		m = _mm_set1_ps(f);
+
+		#endif
 	}

 	__forceinline void operator = (__m128 m)
@ -2788,6 +2831,15 @@ public:
 		return GSVector4(_mm_max_ps(m, a));
 	}

+	#if _M_SSE >= 0x401
+
+	template<int mask> __forceinline GSVector4 blend32(const GSVector4& a)  const
+	{
+		return GSVector4(_mm_blend_ps(m, a, mask));
+	}
+
+	#endif
+
 	__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask)  const
 	{
 		#if _M_SSE >= 0x401
@ -2855,7 +2907,7 @@ public:
 		#endif
 	}

-	template<int src, int dst> __forceinline GSVector4 insert(const GSVector4& v) const
+	template<int src, int dst> __forceinline GSVector4 insert32(const GSVector4& v) const
 	{
 		// TODO: use blendps when src == dst

@ -2918,7 +2970,7 @@ public:
 		return *this;
 	}

-	template<int i> __forceinline int extract() const
+	template<int i> __forceinline int extract32() const
 	{
 		#if _M_SSE >= 0x401

@ -3273,7 +3325,15 @@ public:

 	__forceinline GSVector8i(__m128i m0, __m128i m1)
 	{
-		this->m = zero().insert<0>(m0).insert<1>(m1);
+		#if 0 // _MSC_VER >= 1700 
+		
+		this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m0), _mm256_castsi128_si256(m1), 0);
+		
+		#else
+		
+		*this = zero().insert<0>(m0).insert<1>(m1);
+
+		#endif
 	}

 	__forceinline GSVector8i(const GSVector8i& v)
@ -3283,20 +3343,12 @@ public:

 	__forceinline explicit GSVector8i(int i)
 	{
-		#if _M_SSE >= 0x501
-		m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
-		#else
-		m = _mm256_set1_epi32(i);
-		#endif
+		*this = i;
 	}

 	__forceinline explicit GSVector8i(__m128i m)
 	{
-		#if _M_SSE >= 0x501
-		this->m = _mm256_broadcastsi128_si256(m);
-		#else
-		this->m = zero().insert<0>(m).insert<1>(m);
-		#endif
+		*this = m;
 	}

 	__forceinline explicit GSVector8i(__m256i m)
@ -3311,19 +3363,19 @@ public:

 	__forceinline void operator = (int i)
 	{
-		#if _M_SSE >= 0x501
-		m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
-		#else
-		m = _mm256_set1_epi32(i);
-		#endif
+		m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)); // m = _mm256_set1_epi32(i);
 	}

 	__forceinline void operator = (__m128i m)
 	{
-		#if _M_SSE >= 0x501
-		this->m = _mm256_broadcastsi128_si256(m);
+		#if 0 // _MSC_VER >= 1700 
+		
+		this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m), _mm256_castsi128_si256(m), 0);
+		
 		#else
-		this->m = zero().insert<0>(m).insert<1>(m);
+		
+		*this = zero().insert<0>(m).aa();
+
 		#endif
 	}

@ -3609,74 +3661,78 @@ public:
 		return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256()));
 	}

-	__forceinline GSVector8i i8to16() const
+	// cross lane! from 128-bit to full 256-bit range
+
+	__forceinline GSVector8i i8to16c() const
 	{
 		return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u8to16() const
+	__forceinline GSVector8i u8to16c() const
 	{
 		return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i i8to32() const
+	__forceinline GSVector8i i8to32c() const
 	{
 		return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u8to32() const
+	__forceinline GSVector8i u8to32c() const
 	{
 		return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i i8to64() const
+	__forceinline GSVector8i i8to64c() const
 	{
 		return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u8to64() const
+	__forceinline GSVector8i u8to64c() const
 	{
 		return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i i16to32() const
+	__forceinline GSVector8i i16to32c() const
 	{
 		return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u16to32() const
+	__forceinline GSVector8i u16to32c() const
 	{
 		return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i i16to64() const
+	__forceinline GSVector8i i16to64c() const
 	{
 		return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u16to64() const
+	__forceinline GSVector8i u16to64c() const
 	{
 		return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i i32to64() const
+	__forceinline GSVector8i i32to64c() const
 	{
 		return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m)));
 	}

-	__forceinline GSVector8i u32to64() const
+	__forceinline GSVector8i u32to64c() const
 	{
 		return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m)));
 	}

+	//
+
 	template<int i> __forceinline GSVector8i srl() const
 	{
-		return GSVector8i(_mm256_srli_si156(m, i));
+		return GSVector8i(_mm256_srli_si256(m, i));
 	}

 	template<int i> __forceinline GSVector8i srl(const GSVector8i& v)
 	{
-		return GSVector4i(_mm256_alignr_epi8(v.m, m, i));
+		return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
 	}

 	template<int i> __forceinline GSVector8i sll() const
@ -4065,12 +4121,14 @@ public:

 	template<int i> __forceinline GSVector4i extract() const
 	{
-		return GSVector4i(_mm256_extractf128_si256(m, i));
+		if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
+
+		return GSVector4i(_mm256_extracti128_si256(m, i));
 	}

 	template<int i> __forceinline GSVector8i insert(__m128i m) const
 	{
-		return GSVector8i(_mm256_insertf128_si256(this->m, m, i));
+		return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
 	}

 	// TODO: gather
@ -4111,6 +4169,16 @@ public:
 		*/
 	}

+	__forceinline static GSVector8i load(const void* pll, const void* plh, const void* phl, const void* phh)
+	{
+		GSVector4i l = GSVector4i::load(pll, plh);
+		GSVector4i h = GSVector4i::load(phl, phh);
+
+		return cast(l).ac(cast(h));
+
+		// return GSVector8i(l).insert<1>(h);
+	}
+
 	template<bool aligned> __forceinline static GSVector8i load(const void* p)
 	{
 		return GSVector8i(aligned ? _mm256_load_si256((__m256i*)p) : _mm256_loadu_si256((__m256i*)p));
@ -4243,6 +4311,78 @@ public:
 		b = c.bd(d);
 	}

+	__forceinline static void sw4(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		const __m256i epi32_0f0f0f0f = _mm256_set1_epi32(0x0f0f0f0f);
+
+		GSVector8i mask(epi32_0f0f0f0f);
+
+		GSVector8i e = (b << 4).blend(a, mask);
+		GSVector8i f = b.blend(a >> 4, mask);
+		GSVector8i g = (d << 4).blend(c, mask);
+		GSVector8i h = d.blend(c >> 4, mask);
+
+		a = e.upl8(f);
+		c = e.uph8(f);
+		b = g.upl8(h);
+		d = g.uph8(h);
+	}
+
+	__forceinline static void sw8(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl8(b);
+		c = e.uph8(b);
+		b = f.upl8(d);
+		d = f.uph8(d);
+	}
+
+	__forceinline static void sw16(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl16(b);
+		c = e.uph16(b);
+		b = f.upl16(d);
+		d = f.uph16(d);
+	}
+
+	__forceinline static void sw32(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl32(b);
+		c = e.uph32(b);
+		b = f.upl32(d);
+		d = f.uph32(d);
+	}
+
+	__forceinline static void sw64(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl64(b);
+		c = e.uph64(b);
+		b = f.upl64(d);
+		d = f.uph64(d);
+	}
+
+	__forceinline static void sw128(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.ac(b);
+		c = e.bd(b);
+		b = f.ac(d);
+		d = f.bd(d);
+	}
+
 	__forceinline void operator += (const GSVector8i& v)
 	{
 		m = _mm256_add_epi32(m, v);
@ -4706,6 +4846,8 @@ public:
 	static const GSVector8 m_one;
 	static const GSVector8 m_x7fffffff;
 	static const GSVector8 m_x80000000;
+	static const GSVector8 m_x4b000000;
+	static const GSVector8 m_x4f800000;

 	__forceinline GSVector8() 
 	{
@ -4723,9 +4865,7 @@ public:

 	__forceinline GSVector8(__m128 m0, __m128 m1)
 	{
-		// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps (in vs2012 they simply changed it to vmovups, still can't keep the second xmm in a register)
-
-		#if _MSC_VER >= 1700 
+		#if 0 // _MSC_VER >= 1700 
 		
 		this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);

@ -4743,23 +4883,27 @@ public:

 	__forceinline explicit GSVector8(float f)
 	{
-		m = _mm256_set1_ps(f);
+		*this = f;
+	}
+
+	__forceinline explicit GSVector8(int i)
+	{
+		#if _M_SSE >= 0x501
+
+		m = _mm256_cvtepi32_ps(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)));
+
+		#else 
+
+		GSVector4i v((int)i);
+
+		*this = GSVector4(v);
+
+		#endif
 	}

 	__forceinline explicit GSVector8(__m128 m)
 	{
-		// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
-
-		#if _MSC_VER >= 1700 
-		
-		this->m = _mm256_castps128_ps256(m);
-		this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
-
-		#else
-
-		this->m = zero().insert<0>(m).aa();
-
-		#endif
+		*this = m;
 	}

 	__forceinline explicit GSVector8(__m256 m)
@ -4785,17 +4929,22 @@ public:

 	__forceinline void operator = (float f)
 	{
+		#if _M_SSE >= 0x501
+
+		m =  _mm256_broadcastss_ps(_mm_load_ss(&f));
+
+		#else
+
 		m = _mm256_set1_ps(f);
+
+		#endif
 	}

 	__forceinline void operator = (__m128 m)
 	{
-		// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
+		#if 0 // _MSC_VER >= 1700 
 		
-		#if _MSC_VER >= 1700 
-		
-		this->m = _mm256_castps128_ps256(m);
-		this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
+		this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m), _mm256_castps128_ps256(m), 0x20);

 		#else

@ -5092,7 +5241,70 @@ public:
 		return _mm256_testz_ps(m, m) != 0;
 	}
 	
-	// TODO: 32-bit insert/extract
+
+	template<int src, int dst> __forceinline GSVector8 insert32(const GSVector8& v) const
+	{
+		// TODO: use blendps when src == dst
+
+		ASSERT(src < 4 && dst < 4); // not cross lane like extract32()
+
+		switch(dst)
+		{
+		case 0:
+			switch(src)
+			{
+			case 0: return yyxx(v).zxzw(*this);
+			case 1: return yyyy(v).zxzw(*this);
+			case 2: return yyzz(v).zxzw(*this);
+			case 3: return yyww(v).zxzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 1:
+			switch(src)
+			{
+			case 0: return xxxx(v).xzzw(*this);
+			case 1: return xxyy(v).xzzw(*this);
+			case 2: return xxzz(v).xzzw(*this);
+			case 3: return xxww(v).xzzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 2:
+			switch(src)
+			{
+			case 0: return xyzx(wwxx(v));
+			case 1: return xyzx(wwyy(v));
+			case 2: return xyzx(wwzz(v));
+			case 3: return xyzx(wwww(v));
+			default: __assume(0);
+			}
+			break;
+		case 3:
+			switch(src)
+			{
+			case 0: return xyxz(zzxx(v));
+			case 1: return xyxz(zzyy(v));
+			case 2: return xyxz(zzzz(v));
+			case 3: return xyxz(zzww(v));
+			default: __assume(0);
+			}
+			break;
+		default:
+			__assume(0);
+		}
+
+		return *this;
+	}
+
+	template<int i> __forceinline int extract32() const
+	{
+		if(i < 4) return extract<0>().extract<i>();
+		else if(i < 8) return extract<1>().extract<i - 4>();
+		else ASSERT(0);
+
+		return 0;
+	}

 	template<int i> __forceinline GSVector8 insert(__m128 m) const
 	{
@ -5101,6 +5313,8 @@ public:

 	template<int i> __forceinline GSVector4 extract() const
 	{
+		if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
+
 		return GSVector4(_mm256_extractf128_ps(m, i));
 	}

@ -5114,14 +5328,44 @@ public:
 		return zero() == zero();
 	}

-	// TODO: load low, ss
+	// TODO
+
+	__forceinline static GSVector8 loadl(const void* p)
+	{
+		return GSVector8(_mm256_castps128_ps256(_mm_load_ps((float*)p)));
+	}
+
+	__forceinline static GSVector8 loadh(const void* p)
+	{
+		return zero().insert<1>(_mm_load_ps((float*)p));
+	}
+
+	__forceinline static GSVector8 loadh(const void* p, const GSVector8& v)
+	{
+		return GSVector8(_mm256_insertf128_ps(v, _mm_load_ps((float*)p), 1));
+	}
+
+	__forceinline static GSVector8 load(const void* pl, const void* ph)
+	{
+		return loadh(ph, loadl(pl));
+	}

 	template<bool aligned> __forceinline static GSVector8 load(const void* p)
 	{
 		return GSVector8(aligned ? _mm256_load_ps((const float*)p) : _mm256_loadu_ps((const float*)p));
 	}

-	// TODO: store low, ss
+	// TODO
+
+	__forceinline static void storel(void* p, const GSVector8& v)
+	{
+		_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 0));
+	}
+
+	__forceinline static void storeh(void* p, const GSVector8& v)
+	{
+		_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 1));
+	}

 	template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
 	{
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@ -402,6 +402,24 @@ struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_fr
 		return retval;
 	}

+	__forceinline long _InterlockedCompareExchange(volatile long* const Destination, const long Exchange, const long Comperand)
+	{
+		long retval = Comperand;
+		
+		__asm__("lock; cmpxchgl %k[Exchange], %[Destination]" : [retval] "+a" (retval) : [Destination] "m" (*Destination), [Exchange] "q" (Exchange): "memory");
+		
+		return retval;
+	}
+
+	__forceinline long _InterlockedExchange(volatile long* const Target, const long Value)
+	{
+		long retval = Value;
+		
+		__asm__("xchgl %[retval], %[Target]" : [retval] "+r" (retval) : [Target] "m" (*Target) : "memory");
+
+		return retval;
+	}
+
 	__forceinline long _InterlockedExchangeAdd(volatile long* const Addend, const long Value)
 	{
 		long retval = Value;