From ca7abd983aed6762efa41d0c51b4bbbfb84134b5 Mon Sep 17 00:00:00 2001
From: gabest11 <gabest11@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Mon, 7 Feb 2011 01:59:05 +0000
Subject: [PATCH] Mostly code cleanups, XBYAK 2.99, VEX conversion for the sw
 renderer (3-5% faster), GSState::Move fix for dark cloud 2 invention crash.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4287 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/include/Pcsx2Defs.h                   |    1 +
 plugins/GSdx/GS.cpp                          |  160 +-
 plugins/GSdx/GS.h                            |    6 +-
 plugins/GSdx/GSBlock.h                       |   14 +-
 plugins/GSdx/GSCaptureDlg.cpp                |    4 +-
 plugins/GSdx/GSClut.cpp                      |    4 +-
 plugins/GSdx/GSClut.h                        |    6 +-
 plugins/GSdx/GSDevice.cpp                    |    7 +-
 plugins/GSdx/GSDevice.h                      |    4 +-
 plugins/GSdx/GSDevice11.cpp                  |   33 +-
 plugins/GSdx/GSDevice9.cpp                   |  139 +-
 plugins/GSdx/GSDeviceDX.h                    |    4 +-
 plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 4037 ++++++++++++------
 plugins/GSdx/GSDrawScanlineCodeGenerator.h   |    4 +-
 plugins/GSdx/GSDrawingContext.h              |    4 +-
 plugins/GSdx/GSDrawingEnvironment.h          |    2 +-
 plugins/GSdx/GSLocalMemory.cpp               |   90 +-
 plugins/GSdx/GSLocalMemory.h                 |   18 +-
 plugins/GSdx/GSRasterizer.cpp                |   60 +-
 plugins/GSdx/GSRasterizer.h                  |    6 +-
 plugins/GSdx/GSRenderer.cpp                  |   16 +-
 plugins/GSdx/GSRenderer.h                    |    7 +-
 plugins/GSdx/GSRendererDX.h                  |    2 +
 plugins/GSdx/GSRendererDX11.cpp              |  186 +-
 plugins/GSdx/GSRendererDX11.h                |    5 +-
 plugins/GSdx/GSRendererDX9.cpp               |  160 +-
 plugins/GSdx/GSRendererDX9.h                 |    5 +-
 plugins/GSdx/GSRendererNull.h                |    9 +-
 plugins/GSdx/GSRendererSW.cpp                |  152 +-
 plugins/GSdx/GSRendererSW.h                  |   11 +-
 plugins/GSdx/GSScanlineEnvironment.h         |    4 +-
 plugins/GSdx/GSSettingsDlg.cpp               |   11 +-
 plugins/GSdx/GSSetupPrimCodeGenerator.cpp    |  694 ++-
 plugins/GSdx/GSState.cpp                     |  466 +-
 plugins/GSdx/GSState.h                       |   80 +-
 plugins/GSdx/GSTables.h                      |    5 +-
 plugins/GSdx/GSTexture.cpp                   |    2 +-
 plugins/GSdx/GSTextureCache.cpp              |    6 +-
 plugins/GSdx/GSTextureCache.h                |    2 +-
 plugins/GSdx/GSTextureCacheSW.cpp            |    2 +-
 plugins/GSdx/GSTextureFX11.cpp               |    1 +
 plugins/GSdx/GSTextureFX9.cpp                |    3 +-
 plugins/GSdx/GSVector.cpp                    |   20 -
 plugins/GSdx/GSVector.h                      | 1533 ++++---
 plugins/GSdx/GSVertex.h                      |    2 +-
 plugins/GSdx/GSVertexHW.h                    |    4 +-
 plugins/GSdx/GSVertexList.h                  |    2 +-
 plugins/GSdx/GSVertexSW.h                    |    7 +-
 plugins/GSdx/GSVertexTrace.cpp               |   78 +-
 plugins/GSdx/GSVertexTrace.h                 |   10 +-
 plugins/GSdx/GSWnd.cpp                       |    1 +
 plugins/GSdx/GSdx.def                        |    2 +-
 plugins/GSdx/stdafx.h                        |    3 +-
 plugins/GSdx/xbyak/xbyak.h                   |  520 ++-
 plugins/GSdx/xbyak/xbyak_mnemonic.h          |  755 +++-
 plugins/GSdx/xbyak/xbyak_util.h              |  185 +-
 56 files changed, 6404 insertions(+), 3150 deletions(-)

diff --git a/common/include/Pcsx2Defs.h b/common/include/Pcsx2Defs.h
index 8843863ba8..0b394c0583 100644
--- a/common/include/Pcsx2Defs.h
+++ b/common/include/Pcsx2Defs.h
@@ -193,6 +193,7 @@ static const int __pagesize	= PCSX2_PAGESIZE;
 
 #	define __aligned(alig)	__declspec(align(alig))
 #	define __aligned16		__declspec(align(16))
+#	define __aligned32		__declspec(align(32))
 #	define __pagealigned	__declspec(align(PCSX2_PAGESIZE))
 
 	// Deprecated; use __align instead.
diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp
index 2d09fb884d..e2bdca3911 100644
--- a/plugins/GSdx/GS.cpp
+++ b/plugins/GSdx/GS.cpp
@@ -153,7 +153,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
 {
 	GSDevice* dev = NULL;
 
-	if( renderer == -1 )
+	if(renderer == -1)
 	{
 		renderer = theApp.GetConfig("renderer", 0);
 	}
@@ -167,6 +167,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
 			// GSopen call then they'll get corrupted graphics, but that's not my problem.
 
 			delete s_gs;
+
 			s_gs = NULL;
 		}
 
@@ -178,20 +179,25 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
 		case 12: case 13: new GSDeviceNull(); break;
 		}
 
-		if( !dev ) return -1;
+		if(!dev) return -1;
 
-		if( !s_gs )
+		if(!s_gs)
 		{
 			switch(renderer)
 			{
 			default:
-			case 0: s_gs = new GSRendererDX9(); break;
-			case 3: s_gs = new GSRendererDX11(); break;
+			case 0: 
+				s_gs = new GSRendererDX9(); 
+				break;
+			case 3: 
+				s_gs = new GSRendererDX11(); 
+				break;
 			case 2: case 5: case 8: case 11: case 13:
-				s_gs = new GSRendererNull(); break;
-
+				s_gs = new GSRendererNull(); 
+				break;
 			case 1: case 4: case 7: case 10: case 12:
-				s_gs = new GSRendererSW(); break;
+				s_gs = new GSRendererSW(); 
+				break;
 			}
 
 			s_renderer = renderer;
@@ -519,72 +525,6 @@ EXPORT_C GSsetFrameLimit(int limit)
 
 #ifdef _WINDOWS
 
-// Returns false if the window's been closed or an invalid packet was encountered.
-static __forceinline bool LoopDatPacket_Thingamajig(HWND hWnd, uint8 (&regs)[0x2000], vector<uint8>& buff, FILE* fp, long start)
-{
-	switch(fgetc(fp))
-	{
-	case EOF:
-		fseek(fp, start, 0);
-		return !!IsWindowVisible(hWnd);
-
-	case 0:
-	{
-		uint32 index = fgetc(fp);
-		uint32 size;
-
-		fread(&size, 4, 1, fp);
-
-		switch(index)
-		{
-		case 0:
-		{
-			if(buff.size() < 0x4000) buff.resize(0x4000);
-			uint32 addr = 0x4000 - size;
-			fread(&buff[0] + addr, size, 1, fp);
-			GSgifTransfer1(&buff[0], addr);
-		}
-		break;
-
-		case 1:
-			if(buff.size() < size) buff.resize(size);
-			fread(&buff[0], size, 1, fp);
-			GSgifTransfer2(&buff[0], size / 16);
-		break;
-
-		case 2:
-			if(buff.size() < size) buff.resize(size);
-			fread(&buff[0], size, 1, fp);
-			GSgifTransfer3(&buff[0], size / 16);
-		break;
-		}
-	}
-	break;
-
-	case 1:
-		GSvsync(fgetc(fp));
-		return !!IsWindowVisible(hWnd);
-
-	case 2:
-	{
-		uint32 size;
-		fread(&size, 4, 1, fp);
-		if(buff.size() < size) buff.resize(size);
-		GSreadFIFO2(&buff[0], size / 16);
-	}
-	break;
-
-	case 3:
-		fread(regs, 0x2000, 1, fp);
-	break;
-
-	default:
-		return false;
-	}
-
-	return true;
-}
-
 // lpszCmdLine:
 //   First parameter is the renderer.
 //   Second parameter is the gs file to load and run.
@@ -634,7 +574,73 @@ EXPORT_C GSReplay(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
 
 		GSvsync(1);
 
-		while( LoopDatPacket_Thingamajig(hWnd, regs, buff, fp, start) ) ;
+		bool exit = false;
+
+		while(!exit)
+		{
+			uint32 index;
+			uint32 size;
+			uint32 addr;
+
+			int pos;
+
+			switch(fgetc(fp))
+			{
+			case EOF:
+				fseek(fp, start, 0);
+				exit = !IsWindowVisible(hWnd);
+				break;
+
+			case 0:
+				index = fgetc(fp);
+				fread(&size, 4, 1, fp);
+
+				switch(index)
+				{
+				case 0:
+					if(buff.size() < 0x4000) buff.resize(0x4000);
+					addr = 0x4000 - size;
+					fread(buff.data() + addr, size, 1, fp);
+					GSgifTransfer1(buff.data(), addr);
+					break;
+
+				case 1:
+					if(buff.size() < size) buff.resize(size);
+					fread(buff.data(), size, 1, fp);
+					GSgifTransfer2(buff.data(), size / 16);
+					break;
+
+				case 2:
+					if(buff.size() < size) buff.resize(size);
+					fread(buff.data(), size, 1, fp);
+					GSgifTransfer3(buff.data(), size / 16);
+					break;
+
+				case 3:
+					if(buff.size() < size) buff.resize(size);
+					fread(buff.data(), size, 1, fp);
+					GSgifTransfer(buff.data(), size / 16);
+					break;
+				}
+
+				break;
+
+			case 1:
+				GSvsync(fgetc(fp));
+				exit = !IsWindowVisible(hWnd);
+				break;
+
+			case 2:
+				fread(&size, 4, 1, fp);
+				if(buff.size() < size) buff.resize(size);
+				GSreadFIFO2(&buff[0], size / 16);
+				break;
+
+			case 3:
+				fread(regs, 0x2000, 1, fp);
+				break;
+			}
+		}
 
 		GSclose();
 		GSshutdown();
@@ -672,7 +678,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 			{PSM_PSMZ16S, "16ZS"},
 		};
 
-		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
+		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
 
 		for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;
 
@@ -809,7 +815,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
 	{
 		GSLocalMemory mem;
 
-		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
+		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
 
 		for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;
 
diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h
index f01cfc35dd..9d38cee6a0 100644
--- a/plugins/GSdx/GS.h
+++ b/plugins/GSdx/GS.h
@@ -77,6 +77,7 @@ enum GIF_REG
 	GIF_REG_CLAMP_1	= 0x08,
 	GIF_REG_CLAMP_2	= 0x09,
 	GIF_REG_FOG		= 0x0a,
+	GIF_REG_INVALID	= 0x0b,
 	GIF_REG_XYZF3	= 0x0c,
 	GIF_REG_XYZ3	= 0x0d,
 	GIF_REG_A_D		= 0x0e,
@@ -1077,7 +1078,7 @@ REG128_SET(GIFPackedReg)
 	GIFPackedNOP	NOP;
 REG_SET_END
 
-__aligned16 struct GIFPath
+__aligned32 struct GIFPath
 {
 	GIFTag tag;
 	uint32 reg;
@@ -1107,8 +1108,11 @@ __aligned16 struct GIFPath
 		if((++reg & 0xf) == nreg)
 		{
 			reg = 0;
+
 			if(--nloop == 0)
+			{
 				return false;
+			}
 		}
 
 		return true;
diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h
index 7c9b9496bf..b9f3bfa5dd 100644
--- a/plugins/GSdx/GSBlock.h
+++ b/plugins/GSdx/GSBlock.h
@@ -1201,7 +1201,7 @@ public:
 
 		#else
 /*
-		__aligned16 uint32 block[8 * 8];
+		__aligned32 uint32 block[8 * 8];
 
 		UnpackBlock4HL(src, srcpitch, block);
 
@@ -1316,7 +1316,7 @@ public:
 
 		#else
 /*
-		__aligned16 uint32 block[8 * 8];
+		__aligned32 uint32 block[8 * 8];
 
 		UnpackBlock4HH(src, srcpitch, block);
 
@@ -1467,7 +1467,7 @@ public:
 
 		#else
 
-		__aligned16 uint8 block[16 * 16];
+		__aligned32 uint8 block[16 * 16];
 
 		ReadBlock8<true>(src, (uint8*)block, sizeof(block) / 16);
 
@@ -1542,7 +1542,7 @@ public:
 
 		#else
 
-		__aligned16 uint8 block[(32 / 2) * 16];
+		__aligned32 uint8 block[(32 / 2) * 16];
 
 		ReadBlock4<true>(src, (uint8*)block, sizeof(block) / 16);
 
@@ -1583,7 +1583,7 @@ public:
 
 		#else
 
-		__aligned16 uint32 block[8 * 8];
+		__aligned32 uint32 block[8 * 8];
 
 		ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
 
@@ -1624,7 +1624,7 @@ public:
 
 		#else
 
-		__aligned16 uint32 block[8 * 8];
+		__aligned32 uint32 block[8 * 8];
 
 		ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
 
@@ -1665,7 +1665,7 @@ public:
 
 		#else
 
-		__aligned16 uint32 block[8 * 8];
+		__aligned32 uint32 block[8 * 8];
 
 		ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
 
diff --git a/plugins/GSdx/GSCaptureDlg.cpp b/plugins/GSdx/GSCaptureDlg.cpp
index faa7c5f366..f7fa4363d7 100644
--- a/plugins/GSdx/GSCaptureDlg.cpp
+++ b/plugins/GSdx/GSCaptureDlg.cpp
@@ -68,7 +68,8 @@ void GSCaptureDlg::OnInit()
 
 	ComboBoxAppend(IDC_CODECS, "Uncompressed", 0, true);
 
-	CoInitialize(0);
+	CoInitialize(0); // this is obviously wrong here, each thread should call this on start, and where is CoUninitialize?
+
 	BeginEnumSysDev(CLSID_VideoCompressorCategory, moniker)
 	{
 		Codec c;
@@ -195,6 +196,7 @@ bool GSCaptureDlg::OnCommand(HWND hWnd, UINT id, UINT code)
 		if (ris != 2)
 		{
 			wstring s = wstring(c.DisplayName.m_str);
+
 			theApp.SetConfig("CaptureVideoCodecDisplayName", string(s.begin(), s.end()).c_str());
 		}
 		else
diff --git a/plugins/GSdx/GSClut.cpp b/plugins/GSdx/GSClut.cpp
index 79d68bd27b..32178485a5 100644
--- a/plugins/GSdx/GSClut.cpp
+++ b/plugins/GSdx/GSClut.cpp
@@ -126,7 +126,7 @@ void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
 
 void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ASSERT(TEX0.CSA == 0);
 
@@ -135,7 +135,7 @@ void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TE
 
 void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ASSERT(TEX0.CSA < 16);
 
diff --git a/plugins/GSdx/GSClut.h b/plugins/GSdx/GSClut.h
index ba344a4672..6967e84536 100644
--- a/plugins/GSdx/GSClut.h
+++ b/plugins/GSdx/GSClut.h
@@ -28,7 +28,7 @@
 
 class GSLocalMemory;
 
-__aligned16 class GSClut : public GSAlignedClass<16>
+__aligned32 class GSClut : public GSAlignedClass<32>
 {
 	GSLocalMemory* m_mem;
 
@@ -37,7 +37,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
 	uint32* m_buff32;
 	uint64* m_buff64;
 
-	__aligned16 struct WriteState
+	__aligned32 struct WriteState
 	{
 		GIFRegTEX0 TEX0;
 		GIFRegTEXCLUT TEXCLUT;
@@ -45,7 +45,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
 		bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
 	} m_write;
 
-	__aligned16 struct ReadState
+	__aligned32 struct ReadState
 	{
 		GIFRegTEX0 TEX0;
 		GIFRegTEXA TEXA;
diff --git a/plugins/GSdx/GSDevice.cpp b/plugins/GSdx/GSDevice.cpp
index d6bde005b1..02224a81fb 100644
--- a/plugins/GSdx/GSDevice.cpp
+++ b/plugins/GSdx/GSDevice.cpp
@@ -145,8 +145,11 @@ void GSDevice::Recycle(GSTexture* t)
 	if(t)
 	{
 		t->last_frame_used = m_frame;
+
 		m_pool.push_front(t);
+		
 		//printf("%d\n",m_pool.size());
+		
 		while(m_pool.size() > 300)
 		{
 			delete m_pool.back();
@@ -159,9 +162,11 @@ void GSDevice::Recycle(GSTexture* t)
 void GSDevice::AgePool()
 {
 	m_frame++;
-	while (m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
+
+	while(m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
 	{
 		delete m_pool.back();
+
 		m_pool.pop_back();
 	}
 }
diff --git a/plugins/GSdx/GSDevice.h b/plugins/GSdx/GSDevice.h
index b2af6ae9e2..a61ee3f7af 100644
--- a/plugins/GSdx/GSDevice.h
+++ b/plugins/GSdx/GSDevice.h
@@ -46,7 +46,7 @@ struct InterlaceConstantBuffer
 
 #pragma pack(pop)
 
-class GSDevice : public GSAlignedClass<16>
+class GSDevice : public GSAlignedClass<32>
 {
 	list<GSTexture*> m_pool;
 
@@ -66,7 +66,7 @@ protected:
 	struct {size_t stride, start, count, limit;} m_vertices;
 	uint32 m_msaa;
 	DXGI_SAMPLE_DESC m_msaa_desc;
-	unsigned m_frame; // for ageing the pool
+	unsigned int m_frame; // for ageing the pool
 
 	virtual GSTexture* Create(int type, int w, int h, bool msaa, int format) = 0;
 
diff --git a/plugins/GSdx/GSDevice11.cpp b/plugins/GSdx/GSDevice11.cpp
index 196569e410..bfa1639809 100644
--- a/plugins/GSdx/GSDevice11.cpp
+++ b/plugins/GSdx/GSDevice11.cpp
@@ -229,8 +229,10 @@ bool GSDevice11::Create(GSWnd* wnd)
 		}
 	}
 
-	if (m_msaa_desc.Count == 1)
+	if(m_msaa_desc.Count == 1)
+	{
 		m_msaa = 0;
+	}
 
 	// convert
 
@@ -378,7 +380,7 @@ bool GSDevice11::Create(GSWnd* wnd)
 
 	if(m_wnd->IsManaged())
 	{
-		SetExclusive( !theApp.GetConfig("windowed", 1) );
+		SetExclusive(!theApp.GetConfig("windowed", 1));
 	}
 
 	return true;
@@ -392,11 +394,14 @@ bool GSDevice11::Reset(int w, int h)
 	if(m_swapchain)
 	{
 		DXGI_SWAP_CHAIN_DESC scd;
+		
 		memset(&scd, 0, sizeof(scd));
+
 		m_swapchain->GetDesc(&scd);
 		m_swapchain->ResizeBuffers(scd.BufferCount, w, h, scd.BufferDesc.Format, 0);
 
 		CComPtr<ID3D11Texture2D> backbuffer;
+
 		if(FAILED(m_swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backbuffer)))
 		{
 			return false;
@@ -422,9 +427,12 @@ void GSDevice11::SetExclusive(bool isExcl)
 	m_swapchain->ResizeTarget(&desc);
 	*/
 
-	HRESULT hr = m_swapchain->SetFullscreenState( isExcl, NULL );
+	HRESULT hr = m_swapchain->SetFullscreenState(isExcl, NULL);
+
 	if(hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE)
+	{
 		fprintf(stderr, "(GSdx10) SetExclusive(%s) failed; request unavailable.", isExcl ? "true" : "false");
+	}
 }
 
 void GSDevice11::Flip()
@@ -885,10 +893,13 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
 void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
 {
 	ID3D11ShaderResourceView* srv = NULL;
-	if (sr) srv = *(GSTexture11*)sr;
 
-	if (m_state.ps_srv[i] != srv) {
+	if(sr) srv = *(GSTexture11*)sr;
+
+	if(m_state.ps_srv[i] != srv) 
+	{
 		m_state.ps_srv[i] = srv;
+
 		m_srv_changed = true;
 	}
 }
@@ -914,13 +925,17 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
 		m_ctx->PSSetShader(ps, NULL, 0);
 	}
 
-	if (m_srv_changed) {
+	if (m_srv_changed)
+	{
 		m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
+	
 		m_srv_changed = false;
 	}
 
-	if (m_ss_changed) {
+	if(m_ss_changed) 
+	{
 		m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
+	
 		m_ss_changed = false;
 	}
 
@@ -982,8 +997,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
 
 		vp.TopLeftX = 0;
 		vp.TopLeftY = 0;
-		vp.Width = (FLOAT)rt->GetWidth();
-		vp.Height = (FLOAT)rt->GetHeight();
+		vp.Width = (float)rt->GetWidth();
+		vp.Height = (float)rt->GetHeight();
 		vp.MinDepth = 0.0f;
 		vp.MaxDepth = 1.0f;
 
diff --git a/plugins/GSdx/GSDevice9.cpp b/plugins/GSdx/GSDevice9.cpp
index 731fb2390e..ccf6534f16 100644
--- a/plugins/GSdx/GSDevice9.cpp
+++ b/plugins/GSdx/GSDevice9.cpp
@@ -31,7 +31,6 @@ GSDevice9::GSDevice9()
 
 	memset(&m_pp, 0, sizeof(m_pp));
 	memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
-
 	memset(&m_state, 0, sizeof(m_state));
 
 	m_state.bf = 0xffffffff;
@@ -39,81 +38,109 @@ GSDevice9::GSDevice9()
 
 GSDevice9::~GSDevice9()
 {
-	for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
-
 	for_each(m_om_bs.begin(), m_om_bs.end(), delete_second());
 	for_each(m_om_dss.begin(), m_om_dss.end(), delete_second());
 	for_each(m_ps_ss.begin(), m_ps_ss.end(), delete_second());
+	for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
 
 	if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
 	if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
 }
 
+// if supported and null != msaa_desc, msaa_desc will contain requested Count and Quality
+
+static bool IsMsaaSupported(IDirect3D9* d3d, D3DFORMAT depth_format, uint msaaCount, DXGI_SAMPLE_DESC* msaa_desc = NULL)
+{
+	if(msaaCount > 16) return false;
 
-//if supported and null!=msaa_desc,  msaa_desc will contain requested Count and Quality
-static bool IsMsaaSupported(CComPtr<IDirect3D9>& d3d, D3DFORMAT depth_format, uint msaaCount, OUT DXGI_SAMPLE_DESC* msaa_desc=NULL){
 	D3DCAPS9 d3dcaps;
 
-	if (msaaCount>16) return false;
-
 	memset(&d3dcaps, 0, sizeof(d3dcaps));
+
 	d3d->GetDeviceCaps(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, &d3dcaps);
 
 	DWORD quality[2] = {0, 0};
 
-	if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] >0
-	&& SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] >0
-	){
-		if (msaa_desc){
-			msaa_desc->Count	= msaaCount;
-			msaa_desc->Quality	= std::min<DWORD>(quality[0] - 1, quality[1] - 1);
+	if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] > 0
+	&& SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] > 0)
+	{
+		if(msaa_desc)
+		{
+			msaa_desc->Count = msaaCount;
+			msaa_desc->Quality = std::min<DWORD>(quality[0] - 1, quality[1] - 1);
 		}
+
 		return true;
 	}
 
 	return false;
 }
 
-static bool TestDepthFormat(CComPtr<IDirect3D9> &d3d, D3DFORMAT format)
+static bool TestDepthFormat(IDirect3D9* d3d, D3DFORMAT format)
 {
-	if (FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format)))
+	if(FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format)))
+	{
 		return false;
-	if (FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format)))
+	}
+
+	if(FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format)))
+	{
 		return false;
+	}
+
 	return true;
 }
 
+static D3DFORMAT BestD3dFormat(IDirect3D9* d3d, int msaaCount = 0, DXGI_SAMPLE_DESC* msaa_desc = NULL)
+{
+	if(!d3d) return D3DFMT_UNKNOWN; // keep the null guard: GetMaxDepth passes an unchecked Direct3DCreate9 result
 
-//In descending order of preference
-static D3DFORMAT s_DX9formatsToSearch[]={D3DFMT_D32, D3DFMT_D32F_LOCKABLE, D3DFMT_D24S8};
+	static D3DFORMAT fmts[] = // in descending order of preference
+	{
+		D3DFMT_D32, 
+		D3DFMT_D32F_LOCKABLE, 
+		D3DFMT_D24S8
+	};
 
-static D3DFORMAT BestD3dFormat(CComPtr<IDirect3D9>& d3d, int msaaCount=0, OUT DXGI_SAMPLE_DESC* msaa_desc=NULL){
-	if(!d3d) return D3DFMT_UNKNOWN;
-	if (1==msaaCount) msaaCount=0;
+	if(1 == msaaCount) msaaCount = 0;
 
-	for (int i=0; i<sizeof(s_DX9formatsToSearch); i++)
-		if (TestDepthFormat(d3d, s_DX9formatsToSearch[i]) && (!msaaCount || IsMsaaSupported(d3d, s_DX9formatsToSearch[i], msaaCount, msaa_desc)))
-			return s_DX9formatsToSearch[i];
+	for(int i = 0; i < countof(fmts); i++) // element count, not sizeof (bytes), so the scan stays inside the array
+	{
+		if(TestDepthFormat(d3d, fmts[i]) && (!msaaCount || IsMsaaSupported(d3d, fmts[i], msaaCount, msaa_desc)))
+		{
+			return fmts[i];
+		}
+	}
 
 	return D3DFMT_UNKNOWN;
 }
 
-//return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0
-uint GSDevice9::GetMaxDepth(uint msaa=0){
+// return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0
+
+uint GSDevice9::GetMaxDepth(uint msaa = 0)
+{
 	CComPtr<IDirect3D9> d3d;
+
 	d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
 
-	D3DFORMAT f=BestD3dFormat(d3d, msaa);
-	switch (f){
-		case D3DFMT_D32: case D3DFMT_D32F_LOCKABLE:	return 32;
-		case D3DFMT_D24S8:							return 24;
+	switch(BestD3dFormat(d3d, msaa))
+	{
+		case D3DFMT_D32: 
+		case D3DFMT_D32F_LOCKABLE:
+			return 32;
+		case D3DFMT_D24S8:
+			return 24;
 	}
+
 	return 0;
 }
 
-void GSDevice9::ForceValidMsaaConfig(){
-		if (0==GetMaxDepth(theApp.GetConfig("msaa", 0)))
-				theApp.SetConfig("msaa", 0);//replace invalid msaa value in ini file with 0.
+void GSDevice9::ForceValidMsaaConfig()
+{
+	if(0 == GetMaxDepth(theApp.GetConfig("msaa", 0)))
+	{
+		theApp.SetConfig("msaa", 0); // replace invalid msaa value in ini file with 0.
+	}
 };
 
 bool GSDevice9::Create(GSWnd* wnd)
@@ -128,17 +155,26 @@ bool GSDevice9::Create(GSWnd* wnd)
 	m_d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
 
 	if(!m_d3d) return false;
+
 	ForceValidMsaaConfig();
-	//Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
-	//                                the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
-	m_depth_format=BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc);
-	if (D3DFMT_UNKNOWN == m_depth_format){
-		//can't find a format with requested msaa, try without.
-		m_depth_format = BestD3dFormat(m_d3d, 0);
-		if (D3DFMT_UNKNOWN == m_depth_format)
-			return false;
+	
+	// Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
+	// the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
+	
+	m_depth_format = BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc);
+	
+	if(D3DFMT_UNKNOWN == m_depth_format)
+	{
+		// can't find a format with requested msaa, try without.
 		
-		m_msaa=0;
+		m_depth_format = BestD3dFormat(m_d3d, 0);
+		
+		if(D3DFMT_UNKNOWN == m_depth_format)
+		{
+			return false;
+		}
+
+		m_msaa = 0;
 	}
 
 	memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
@@ -180,7 +216,6 @@ bool GSDevice9::Create(GSWnd* wnd)
 		return false;
 	}
 
-
 	if(!Reset(1, 1))
 	{
 		return false;
@@ -274,7 +309,8 @@ bool GSDevice9::Create(GSWnd* wnd)
 
 void GSDevice9::SetVsync(bool enable)
 {
-	if( m_vsync == enable ) return;
+	if(m_vsync == enable) return;
+
 	__super::SetVsync(enable);
 
 	// Clever trick:  Delete the backbuffer, so that the next Present will fail and
@@ -282,6 +318,7 @@ void GSDevice9::SetVsync(bool enable)
 	// vsync settings. :)
 
 	delete m_backbuffer;
+
 	m_backbuffer = NULL;
 }
 
@@ -293,6 +330,7 @@ bool GSDevice9::Reset(int w, int h)
 	HRESULT hr;
 
 	int mode = (!m_wnd->IsManaged() || theApp.GetConfig("windowed", 1)) ? Windowed : Fullscreen;
+
 	if(mode == DontCare)
 	{
 		mode = m_pp.Windowed ? Windowed : Fullscreen;
@@ -707,11 +745,11 @@ void GSDevice9::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, c
 
 	IASetVertexBuffer(vertices, sizeof(vertices[0]), countof(vertices));
 	IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
+	IASetInputLayout(m_convert.il);
 
 	// vs
 
 	VSSetShader(m_convert.vs, NULL, 0);
-	IASetInputLayout(m_convert.il);
 
 	// ps
 
@@ -904,7 +942,7 @@ void GSDevice9::VSSetShader(IDirect3DVertexShader9* vs, const float* vs_cb, int
 			{
 				if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
 
-				m_state.vs_cb = (float*)_aligned_malloc(size, 16);
+				m_state.vs_cb = (float*)_aligned_malloc(size, 32);
 			}
 
 			m_state.vs_cb_len = vs_cb_len;
@@ -926,10 +964,13 @@ void GSDevice9::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
 void GSDevice9::PSSetShaderResource(int i, GSTexture* sr)
 {
 	IDirect3DTexture9* srv = NULL;
-	if (sr) srv = *(GSTexture9*)sr;
 
-	if (m_state.ps_srvs[i] != srv) {
+	if(sr) srv = *(GSTexture9*)sr;
+
+	if(m_state.ps_srvs[i] != srv) 
+	{
 		m_state.ps_srvs[i] = srv;
+	
 		m_dev->SetTexture(i, srv);
 	}
 }
@@ -953,7 +994,7 @@ void GSDevice9::PSSetShader(IDirect3DPixelShader9* ps, const float* ps_cb, int p
 			{
 				if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
 
-				m_state.ps_cb = (float*)_aligned_malloc(size, 16);
+				m_state.ps_cb = (float*)_aligned_malloc(size, 32);
 			}
 
 			m_state.ps_cb_len = ps_cb_len;
diff --git a/plugins/GSdx/GSDeviceDX.h b/plugins/GSdx/GSDeviceDX.h
index aa9d092a28..96f7687cb9 100644
--- a/plugins/GSdx/GSDeviceDX.h
+++ b/plugins/GSdx/GSDeviceDX.h
@@ -30,7 +30,7 @@ class GSDeviceDX : public GSDevice
 public:
 	#pragma pack(push, 1)
 
-	__aligned16 struct VSConstantBuffer
+	__aligned32 struct VSConstantBuffer
 	{
 		GSVector4 VertexScale;
 		GSVector4 VertexOffset;
@@ -86,7 +86,7 @@ public:
 		VSSelector() : key(0) {}
 	};
 
-	__aligned16 struct PSConstantBuffer
+	__aligned32 struct PSConstantBuffer
 	{
 		GSVector4 FogColor_AREF;
 		GSVector4 HalfTexel;
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
index 9412299940..ba45718ebb 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
@@ -104,14 +104,29 @@ L("loop");
 	// xmm6 = ga
 	// xmm7 = test
 
-	if(m_sel.fwrite)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		movdqa(xmm3, xmmword[&m_env.fm]);
-	}
+		if(m_sel.fwrite)
+		{
+			vmovdqa(xmm3, ptr[&m_env.fm]);
+		}
 
-	if(m_sel.zwrite)
+		if(m_sel.zwrite)
+		{
+			vmovdqa(xmm4, ptr[&m_env.zm]);
+		}
+	}
+	else
 	{
-		movdqa(xmm4, xmmword[&m_env.zm]);
+		if(m_sel.fwrite)
+		{
+			movdqa(xmm3, ptr[&m_env.fm]);
+		}
+
+		if(m_sel.zwrite)
+		{
+			movdqa(xmm4, ptr[&m_env.zm]);
+		}
 	}
 
 	// ecx = steps
@@ -177,43 +192,85 @@ L("loop");
 
 	TestDestAlpha();
 
-	// fm |= test;
-	// zm |= test;
-
-	if(m_sel.fwrite)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		por(xmm3, xmm7);
-	}
+		// fm |= test;
+		// zm |= test;
 
-	if(m_sel.zwrite)
+		if(m_sel.fwrite)
+		{
+			vpor(xmm3, xmm7);
+		}
+
+		if(m_sel.zwrite)
+		{
+			vpor(xmm4, xmm7);
+		}
+
+		// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+
+		vpcmpeqd(xmm1, xmm1);
+
+		if(m_sel.fwrite && m_sel.zwrite)
+		{
+			vpcmpeqd(xmm0, xmm1, xmm4);
+			vpcmpeqd(xmm1, xmm3);
+			vpackssdw(xmm1, xmm0);
+		}
+		else if(m_sel.fwrite)
+		{
+			vpcmpeqd(xmm1, xmm3);
+			vpackssdw(xmm1, xmm1);
+		}
+		else if(m_sel.zwrite)
+		{
+			vpcmpeqd(xmm1, xmm4);
+			vpackssdw(xmm1, xmm1);
+		}
+
+		vpmovmskb(edx, xmm1);
+		not(edx);
+	}
+	else
 	{
-		por(xmm4, xmm7);
-	}
+		// fm |= test;
+		// zm |= test;
 
-	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+		if(m_sel.fwrite)
+		{
+			por(xmm3, xmm7);
+		}
 
-	pcmpeqd(xmm1, xmm1);
+		if(m_sel.zwrite)
+		{
+			por(xmm4, xmm7);
+		}
 
-	if(m_sel.fwrite && m_sel.zwrite)
-	{
-		movdqa(xmm0, xmm1);
-		pcmpeqd(xmm1, xmm3);
-		pcmpeqd(xmm0, xmm4);
-		packssdw(xmm1, xmm0);
-	}
-	else if(m_sel.fwrite)
-	{
-		pcmpeqd(xmm1, xmm3);
-		packssdw(xmm1, xmm1);
-	}
-	else if(m_sel.zwrite)
-	{
-		pcmpeqd(xmm1, xmm4);
-		packssdw(xmm1, xmm1);
-	}
+		// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
 
-	pmovmskb(edx, xmm1);
-	not(edx);
+		pcmpeqd(xmm1, xmm1);
+
+		if(m_sel.fwrite && m_sel.zwrite)
+		{
+			movdqa(xmm0, xmm1);
+			pcmpeqd(xmm1, xmm3);
+			pcmpeqd(xmm0, xmm4);
+			packssdw(xmm1, xmm0);
+		}
+		else if(m_sel.fwrite)
+		{
+			pcmpeqd(xmm1, xmm3);
+			packssdw(xmm1, xmm1);
+		}
+		else if(m_sel.zwrite)
+		{
+			pcmpeqd(xmm1, xmm4);
+			packssdw(xmm1, xmm1);
+		}
+
+		pmovmskb(edx, xmm1);
+		not(edx);
+	}
 
 	// ebx = fa
 	// ecx = steps
@@ -262,6 +319,7 @@ L("step");
 	if(!m_sel.edge)
 	{
 		test(ecx, ecx);
+
 		jle("exit", T_NEAR);
 
 		Step();
@@ -298,188 +356,380 @@ void GSDrawScanlineCodeGenerator::Init(int params)
 	sub(ecx, ebx);
 	sub(ecx, 4);
 
-	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
-
-	shl(edx, 4);
-
-	movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]);
-
-	mov(eax, ecx);
-	sar(eax, 31);
-	and(eax, ecx);
-	shl(eax, 4);
-
-	por(xmm7, xmmword[eax + (size_t)&m_test[7]]);
-
-	// GSVector2i* fza_base = &m_env.fzbr[top];
-
-	mov(esi, dword[esp + _top]);
-	lea(esi, ptr[esi * 8]);
-	add(esi, dword[&m_env.fzbr]);
-
-	// GSVector2i* fza_offset = &m_env.fzbc[left >> 2];
-
-	lea(edi, ptr[ebx * 2]);
-	add(edi, dword[&m_env.fzbc]);
-
-	if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// edx = &m_env.d[skip]
+		// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
 
 		shl(edx, 4);
-		lea(edx, ptr[edx + (size_t)m_env.d]);
 
-		// ebx = &v
+		vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
 
-		mov(ebx, dword[esp + _v]);
-	}
+		mov(eax, ecx);
+		sar(eax, 31);
+		and(eax, ecx);
+		shl(eax, 4);
 
-	if(!m_sel.sprite)
-	{
-		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
+		vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
+
+		// GSVector2i* fza_base = &m_env.fzbr[top];
+
+		mov(esi, dword[esp + _top]);
+		lea(esi, ptr[esi * 8]);
+		add(esi, dword[&m_env.fzbr]);
+
+		// GSVector2i* fza_offset = &m_env.fzbc[left >> 2];
+
+		lea(edi, ptr[ebx * 2]);
+		add(edi, dword[&m_env.fzbc]);
+
+		if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
 		{
-			movaps(xmm0, xmmword[ebx + 16]); // v.p
+			// edx = &m_env.d[skip]
 
-			if(m_sel.fwrite && m_sel.fge)
+			shl(edx, 4);
+			lea(edx, ptr[edx + (size_t)m_env.d]);
+
+			// ebx = &v
+
+			mov(ebx, dword[esp + _v]);
+		}
+
+		if(!m_sel.sprite)
+		{
+			if(m_sel.fwrite && m_sel.fge || m_sel.zb)
 			{
-				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);
+				vmovaps(xmm0, ptr[ebx + 16]); // v.p
 
-				cvttps2dq(xmm1, xmm0);
-				pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-				pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-				paddw(xmm1, xmmword[edx + 16 * 6]);
+				if(m_sel.fwrite && m_sel.fge)
+				{
+					// f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);
 
-				movdqa(xmmword[&m_env.temp.f], xmm1);
+					vcvttps2dq(xmm1, xmm0);
+					vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+					vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+					vpaddw(xmm1, ptr[edx + 16 * 6]);
+
+					vmovdqa(ptr[&m_env.temp.f], xmm1);
+				}
+
+				if(m_sel.zb)
+				{
+					// z = vp.zzzz() + m_env.d[skip].z;
+
+					vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+					vaddps(xmm0, ptr[edx]);
+
+					vmovaps(ptr[&m_env.temp.z], xmm0);
+				}
+			}
+		}
+		else
+		{
+			if(m_sel.ztest)
+			{
+				vmovdqa(xmm0, ptr[&m_env.p.z]);
+			}
+		}
+
+		if(m_sel.fb)
+		{
+			if(m_sel.edge || m_sel.tfx != TFX_NONE)
+			{
+				vmovaps(xmm4, ptr[ebx + 32]); // v.t
 			}
 
-			if(m_sel.zb)
+			if(m_sel.edge)
 			{
-				// z = vp.zzzz() + m_env.d[skip].z;
+				vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+				vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
+				vpsrlw(xmm3, 9);
 
-				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
-				addps(xmm0, xmmword[edx]);
+				vmovdqa(ptr[&m_env.temp.cov], xmm3);
+			}
 
-				movaps(xmmword[&m_env.temp.z], xmm0);
+			if(m_sel.tfx != TFX_NONE)
+			{
+				if(m_sel.fst)
+				{
+					// GSVector4i vti(vt);
+
+					vcvttps2dq(xmm4, xmm4);
+
+					// si = vti.xxxx() + m_env.d[skip].si;
+					// ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;
+
+					vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+
+					vpaddd(xmm2, ptr[edx + 16 * 7]);
+
+					if(!m_sel.sprite)
+					{
+						vpaddd(xmm3, ptr[edx + 16 * 8]);
+					}
+					else
+					{
+						if(m_sel.ltf)
+						{
+							vmovdqa(xmm4, xmm3);
+							vpshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+							vpshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+							vpsrlw(xmm4, 1);
+							vmovdqa(ptr[&m_env.temp.vf], xmm4);
+						}
+					}
+
+					vmovdqa(ptr[&m_env.temp.s], xmm2);
+					vmovdqa(ptr[&m_env.temp.t], xmm3);
+				}
+				else
+				{
+					// s = vt.xxxx() + m_env.d[skip].s;
+					// t = vt.yyyy() + m_env.d[skip].t;
+					// q = vt.zzzz() + m_env.d[skip].q;
+
+					vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+					vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+					vaddps(xmm2, ptr[edx + 16 * 1]);
+					vaddps(xmm3, ptr[edx + 16 * 2]);
+					vaddps(xmm4, ptr[edx + 16 * 3]);
+
+					vmovaps(ptr[&m_env.temp.s], xmm2);
+					vmovaps(ptr[&m_env.temp.t], xmm3);
+					vmovaps(ptr[&m_env.temp.q], xmm4);
+
+					vrcpps(xmm4, xmm4);
+					vmulps(xmm2, xmm4);
+					vmulps(xmm3, xmm4);
+				}
+			}
+
+			if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+			{
+				if(m_sel.iip)
+				{
+					// GSVector4i vc = GSVector4i(v.c);
+
+					vcvttps2dq(xmm6, ptr[ebx]); // v.c
+
+					// vc = vc.upl16(vc.zwxy());
+
+					vpshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
+					vpunpcklwd(xmm6, xmm5);
+
+					// rb = vc.xxxx().add16(m_env.d[skip].rb);
+					// ga = vc.zzzz().add16(m_env.d[skip].ga);
+
+					vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+					vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+					vpaddw(xmm5, ptr[edx + 16 * 4]);
+					vpaddw(xmm6, ptr[edx + 16 * 5]);
+
+					vmovdqa(ptr[&m_env.temp.rb], xmm5);
+					vmovdqa(ptr[&m_env.temp.ga], xmm6);
+				}
+				else
+				{
+					if(m_sel.tfx == TFX_NONE)
+					{
+						vmovdqa(xmm5, ptr[&m_env.c.rb]);
+						vmovdqa(xmm6, ptr[&m_env.c.ga]);
+					}
+				}
 			}
 		}
 	}
 	else
 	{
-		if(m_sel.ztest)
-		{
-			movdqa(xmm0, xmmword[&m_env.p.z]);
-		}
-	}
+		// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
 
-	if(m_sel.fb)
-	{
-		if(m_sel.edge || m_sel.tfx != TFX_NONE)
+		shl(edx, 4);
+
+		movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
+
+		mov(eax, ecx);
+		sar(eax, 31);
+		and(eax, ecx);
+		shl(eax, 4);
+
+		por(xmm7, ptr[eax + (size_t)&m_test[7]]);
+
+		// GSVector2i* fza_base = &m_env.fzbr[top];
+
+		mov(esi, dword[esp + _top]);
+		lea(esi, ptr[esi * 8]);
+		add(esi, dword[&m_env.fzbr]);
+
+		// GSVector2i* fza_offset = &m_env.fzbc[left >> 2];
+
+		lea(edi, ptr[ebx * 2]);
+		add(edi, dword[&m_env.fzbc]);
+
+		if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
 		{
-			movaps(xmm4, xmmword[ebx + 32]); // v.t
+			// edx = &m_env.d[skip]
+
+			shl(edx, 4);
+			lea(edx, ptr[edx + (size_t)m_env.d]);
+
+			// ebx = &v
+
+			mov(ebx, dword[esp + _v]);
 		}
 
-		if(m_sel.edge)
+		if(!m_sel.sprite)
 		{
-			pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
-			pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
-			psrlw(xmm3, 9);
-
-			movdqa(xmmword[&m_env.temp.cov], xmm3);
-		}
-
-		if(m_sel.tfx != TFX_NONE)
-		{
-			if(m_sel.fst)
+			if(m_sel.fwrite && m_sel.fge || m_sel.zb)
 			{
-				// GSVector4i vti(vt);
+				movaps(xmm0, ptr[ebx + 16]); // v.p
 
-				cvttps2dq(xmm4, xmm4);
-
-				// si = vti.xxxx() + m_env.d[skip].si;
-				// ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;
-
-				pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
-				pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
-
-				paddd(xmm2, xmmword[edx + 16 * 7]);
-
-				if(!m_sel.sprite)
+				if(m_sel.fwrite && m_sel.fge)
 				{
-					paddd(xmm3, xmmword[edx + 16 * 8]);
+					// f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);
+
+					cvttps2dq(xmm1, xmm0);
+					pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+					pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+					paddw(xmm1, ptr[edx + 16 * 6]);
+
+					movdqa(ptr[&m_env.temp.f], xmm1);
+				}
+
+				if(m_sel.zb)
+				{
+					// z = vp.zzzz() + m_env.d[skip].z;
+
+					shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+					addps(xmm0, ptr[edx]);
+
+					movaps(ptr[&m_env.temp.z], xmm0);
+				}
+			}
+		}
+		else
+		{
+			if(m_sel.ztest)
+			{
+				movdqa(xmm0, ptr[&m_env.p.z]);
+			}
+		}
+
+		if(m_sel.fb)
+		{
+			if(m_sel.edge || m_sel.tfx != TFX_NONE)
+			{
+				movaps(xmm4, ptr[ebx + 32]); // v.t
+
+				//vbroadcastf128(ymm4, ptr[ebx + 32]); // v.t
+				//vzeroupper();
+			}
+
+			if(m_sel.edge)
+			{
+				pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+				pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
+				psrlw(xmm3, 9);
+
+				movdqa(ptr[&m_env.temp.cov], xmm3);
+			}
+
+			if(m_sel.tfx != TFX_NONE)
+			{
+				if(m_sel.fst)
+				{
+					// GSVector4i vti(vt);
+
+					cvttps2dq(xmm4, xmm4);
+
+					// si = vti.xxxx() + m_env.d[skip].si;
+					// ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;
+
+					pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+
+					paddd(xmm2, ptr[edx + 16 * 7]);
+
+					if(!m_sel.sprite)
+					{
+						paddd(xmm3, ptr[edx + 16 * 8]);
+					}
+					else
+					{
+						if(m_sel.ltf)
+						{
+							movdqa(xmm4, xmm3);
+							pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+							pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+							psrlw(xmm4, 1);
+							movdqa(ptr[&m_env.temp.vf], xmm4);
+						}
+					}
+
+					movdqa(ptr[&m_env.temp.s], xmm2);
+					movdqa(ptr[&m_env.temp.t], xmm3);
 				}
 				else
 				{
-					if(m_sel.ltf)
-					{
-						movdqa(xmm4, xmm3);
-						pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
-						pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
-						psrlw(xmm4, 1);
-						movdqa(xmmword[&m_env.temp.vf], xmm4);
-					}
+					// s = vt.xxxx() + m_env.d[skip].s;
+					// t = vt.yyyy() + m_env.d[skip].t;
+					// q = vt.zzzz() + m_env.d[skip].q;
+
+					movaps(xmm2, xmm4);
+					movaps(xmm3, xmm4);
+
+					shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
+					shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
+					shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+					addps(xmm2, ptr[edx + 16 * 1]);
+					addps(xmm3, ptr[edx + 16 * 2]);
+					addps(xmm4, ptr[edx + 16 * 3]);
+
+					movaps(ptr[&m_env.temp.s], xmm2);
+					movaps(ptr[&m_env.temp.t], xmm3);
+					movaps(ptr[&m_env.temp.q], xmm4);
+
+					rcpps(xmm4, xmm4);
+					mulps(xmm2, xmm4);
+					mulps(xmm3, xmm4);
 				}
-
-				movdqa(xmmword[&m_env.temp.s], xmm2);
-				movdqa(xmmword[&m_env.temp.t], xmm3);
 			}
-			else
+
+			if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
 			{
-				// s = vt.xxxx() + m_env.d[skip].s;
-				// t = vt.yyyy() + m_env.d[skip].t;
-				// q = vt.zzzz() + m_env.d[skip].q;
-
-				movaps(xmm2, xmm4);
-				movaps(xmm3, xmm4);
-
-				shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
-				shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
-				shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
-
-				addps(xmm2, xmmword[edx + 16 * 1]);
-				addps(xmm3, xmmword[edx + 16 * 2]);
-				addps(xmm4, xmmword[edx + 16 * 3]);
-
-				movaps(xmmword[&m_env.temp.s], xmm2);
-				movaps(xmmword[&m_env.temp.t], xmm3);
-				movaps(xmmword[&m_env.temp.q], xmm4);
-
-				rcpps(xmm4, xmm4);
-				mulps(xmm2, xmm4);
-				mulps(xmm3, xmm4);
-			}
-		}
-
-		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
-		{
-			if(m_sel.iip)
-			{
-				// GSVector4i vc = GSVector4i(v.c);
-
-				cvttps2dq(xmm6, xmmword[ebx]); // v.c
-
-				// vc = vc.upl16(vc.zwxy());
-
-				pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
-				punpcklwd(xmm6, xmm5);
-
-				// rb = vc.xxxx().add16(m_env.d[skip].rb);
-				// ga = vc.zzzz().add16(m_env.d[skip].ga);
-
-				pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
-				pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
-
-				paddw(xmm5, xmmword[edx + 16 * 4]);
-				paddw(xmm6, xmmword[edx + 16 * 5]);
-
-				movdqa(xmmword[&m_env.temp.rb], xmm5);
-				movdqa(xmmword[&m_env.temp.ga], xmm6);
-			}
-			else
-			{
-				if(m_sel.tfx == TFX_NONE)
+				if(m_sel.iip)
 				{
-					movdqa(xmm5, xmmword[&m_env.c.rb]);
-					movdqa(xmm6, xmmword[&m_env.c.ga]);
+					// GSVector4i vc = GSVector4i(v.c);
+
+					cvttps2dq(xmm6, ptr[ebx]); // v.c
+
+					// vc = vc.upl16(vc.zwxy());
+
+					pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
+					punpcklwd(xmm6, xmm5);
+
+					// rb = vc.xxxx().add16(m_env.d[skip].rb);
+					// ga = vc.zzzz().add16(m_env.d[skip].ga);
+
+					pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+					pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+					paddw(xmm5, ptr[edx + 16 * 4]);
+					paddw(xmm6, ptr[edx + 16 * 5]);
+
+					movdqa(ptr[&m_env.temp.rb], xmm5);
+					movdqa(ptr[&m_env.temp.ga], xmm6);
+				}
+				else
+				{
+					if(m_sel.tfx == TFX_NONE)
+					{
+						movdqa(xmm5, ptr[&m_env.c.rb]);
+						movdqa(xmm6, ptr[&m_env.c.ga]);
+					}
 				}
 			}
 		}
@@ -496,131 +746,260 @@ void GSDrawScanlineCodeGenerator::Step()
 
 	add(edi, 8);
 
-	if(!m_sel.sprite)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// z += m_env.d4.z;
-
-		if(m_sel.zb)
+		if(!m_sel.sprite)
 		{
-			movaps(xmm0, xmmword[&m_env.temp.z]);
-			addps(xmm0, xmmword[&m_env.d4.z]);
-			movaps(xmmword[&m_env.temp.z], xmm0);
-		}
+			// z += m_env.d4.z;
 
-		// f = f.add16(m_env.d4.f);
-
-		if(m_sel.fwrite && m_sel.fge)
-		{
-			movdqa(xmm1, xmmword[&m_env.temp.f]);
-			paddw(xmm1, xmmword[&m_env.d4.f]);
-			movdqa(xmmword[&m_env.temp.f], xmm1);
-		}
-	}
-	else
-	{
-		if(m_sel.ztest)
-		{
-			movdqa(xmm0, xmmword[&m_env.p.z]);
-		}
-	}
-
-	if(m_sel.fb)
-	{
-		if(m_sel.tfx != TFX_NONE)
-		{
-			if(m_sel.fst)
+			if(m_sel.zb)
 			{
-				// GSVector4i st = m_env.d4.st;
+				vmovaps(xmm0, ptr[&m_env.temp.z]);
+				vaddps(xmm0, ptr[&m_env.d4.z]);
+				vmovaps(ptr[&m_env.temp.z], xmm0);
+			}
 
-				// si += st.xxxx();
-				// if(!sprite) ti += st.yyyy();
+			// f = f.add16(m_env.d4.f);
 
-				movdqa(xmm4, xmmword[&m_env.d4.st]);
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				vmovdqa(xmm1, ptr[&m_env.temp.f]);
+				vpaddw(xmm1, ptr[&m_env.d4.f]);
+				vmovdqa(ptr[&m_env.temp.f], xmm1);
+			}
+		}
+		else
+		{
+			if(m_sel.ztest)
+			{
+				vmovdqa(xmm0, ptr[&m_env.p.z]);
+			}
+		}
 
-				pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
-				paddd(xmm2, xmmword[&m_env.temp.s]);
-				movdqa(xmmword[&m_env.temp.s], xmm2);
-
-				if(!m_sel.sprite)
+		if(m_sel.fb)
+		{
+			if(m_sel.tfx != TFX_NONE)
+			{
+				if(m_sel.fst)
 				{
-					pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
-					paddd(xmm3, xmmword[&m_env.temp.t]);
-					movdqa(xmmword[&m_env.temp.t], xmm3);
+					// GSVector4i st = m_env.d4.st;
+
+					// si += st.xxxx();
+					// if(!sprite) ti += st.yyyy();
+
+					vmovdqa(xmm4, ptr[&m_env.d4.st]);
+
+					vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					vpaddd(xmm2, ptr[&m_env.temp.s]);
+					vmovdqa(ptr[&m_env.temp.s], xmm2);
+
+					if(!m_sel.sprite)
+					{
+						vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+						vpaddd(xmm3, ptr[&m_env.temp.t]);
+						vmovdqa(ptr[&m_env.temp.t], xmm3);
+					}
+					else
+					{
+						vmovdqa(xmm3, ptr[&m_env.temp.t]);
+					}
 				}
 				else
 				{
-					movdqa(xmm3, xmmword[&m_env.temp.t]);
+					// GSVector4 stq = m_env.d4.stq;
+
+					// s += stq.xxxx();
+					// t += stq.yyyy();
+					// q += stq.zzzz();
+
+					vmovaps(xmm4, ptr[&m_env.d4.stq]);
+				
+					vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+					vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+					vaddps(xmm2, ptr[&m_env.temp.s]);
+					vaddps(xmm3, ptr[&m_env.temp.t]);
+					vaddps(xmm4, ptr[&m_env.temp.q]);
+
+					vmovaps(ptr[&m_env.temp.s], xmm2);
+					vmovaps(ptr[&m_env.temp.t], xmm3);
+					vmovaps(ptr[&m_env.temp.q], xmm4);
+
+					vrcpps(xmm4, xmm4);
+					vmulps(xmm2, xmm4);
+					vmulps(xmm3, xmm4);
 				}
 			}
-			else
+
+			if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
 			{
-				// GSVector4 stq = m_env.d4.stq;
-
-				// s += stq.xxxx();
-				// t += stq.yyyy();
-				// q += stq.zzzz();
-
-				movaps(xmm2, xmmword[&m_env.d4.stq]);
-				movaps(xmm3, xmm2);
-				movaps(xmm4, xmm2);
-
-				shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
-				shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
-				shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
-
-				addps(xmm2, xmmword[&m_env.temp.s]);
-				addps(xmm3, xmmword[&m_env.temp.t]);
-				addps(xmm4, xmmword[&m_env.temp.q]);
-
-				movaps(xmmword[&m_env.temp.s], xmm2);
-				movaps(xmmword[&m_env.temp.t], xmm3);
-				movaps(xmmword[&m_env.temp.q], xmm4);
-
-				rcpps(xmm4, xmm4);
-				mulps(xmm2, xmm4);
-				mulps(xmm3, xmm4);
-			}
-		}
-
-		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
-		{
-			if(m_sel.iip)
-			{
-				// GSVector4i c = m_env.d4.c;
-
-				// rb = rb.add16(c.xxxx());
-				// ga = ga.add16(c.yyyy());
-
-				movdqa(xmm7, xmmword[&m_env.d4.c]);
-
-				pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
-				pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));
-
-				paddw(xmm5, xmmword[&m_env.temp.rb]);
-				paddw(xmm6, xmmword[&m_env.temp.ga]);
-
-				movdqa(xmmword[&m_env.temp.rb], xmm5);
-				movdqa(xmmword[&m_env.temp.ga], xmm6);
-			}
-			else
-			{
-				if(m_sel.tfx == TFX_NONE)
+				if(m_sel.iip)
 				{
-					movdqa(xmm5, xmmword[&m_env.c.rb]);
-					movdqa(xmm6, xmmword[&m_env.c.ga]);
+					// GSVector4i c = m_env.d4.c;
+
+					// rb = rb.add16(c.xxxx());
+					// ga = ga.add16(c.yyyy());
+
+					vmovdqa(xmm7, ptr[&m_env.d4.c]);
+
+					vpshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+					vpshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));
+
+					vpaddw(xmm5, ptr[&m_env.temp.rb]);
+					vpaddw(xmm6, ptr[&m_env.temp.ga]);
+
+					vmovdqa(ptr[&m_env.temp.rb], xmm5);
+					vmovdqa(ptr[&m_env.temp.ga], xmm6);
+				}
+				else
+				{
+					if(m_sel.tfx == TFX_NONE)
+					{
+						vmovdqa(xmm5, ptr[&m_env.c.rb]);
+						vmovdqa(xmm6, ptr[&m_env.c.ga]);
+					}
 				}
 			}
 		}
+
+		// test = m_test[7 + (steps & (steps >> 31))];
+
+		mov(edx, ecx);
+		sar(edx, 31);
+		and(edx, ecx);
+		shl(edx, 4);
+
+		vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
 	}
+	else
+	{
+		if(!m_sel.sprite)
+		{
+			// z += m_env.d4.z;
 
-	// test = m_test[7 + (steps & (steps >> 31))];
+			if(m_sel.zb)
+			{
+				movaps(xmm0, ptr[&m_env.temp.z]);
+				addps(xmm0, ptr[&m_env.d4.z]);
+				movaps(ptr[&m_env.temp.z], xmm0);
+			}
 
-	mov(edx, ecx);
-	sar(edx, 31);
-	and(edx, ecx);
-	shl(edx, 4);
+			// f = f.add16(m_env.d4.f);
 
-	movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]);
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				movdqa(xmm1, ptr[&m_env.temp.f]);
+				paddw(xmm1, ptr[&m_env.d4.f]);
+				movdqa(ptr[&m_env.temp.f], xmm1);
+			}
+		}
+		else
+		{
+			if(m_sel.ztest)
+			{
+				movdqa(xmm0, ptr[&m_env.p.z]);
+			}
+		}
+
+		if(m_sel.fb)
+		{
+			if(m_sel.tfx != TFX_NONE)
+			{
+				if(m_sel.fst)
+				{
+					// GSVector4i st = m_env.d4.st;
+
+					// si += st.xxxx();
+					// if(!sprite) ti += st.yyyy();
+
+					movdqa(xmm4, ptr[&m_env.d4.st]);
+
+					pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+					paddd(xmm2, ptr[&m_env.temp.s]);
+					movdqa(ptr[&m_env.temp.s], xmm2);
+
+					if(!m_sel.sprite)
+					{
+						pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+						paddd(xmm3, ptr[&m_env.temp.t]);
+						movdqa(ptr[&m_env.temp.t], xmm3);
+					}
+					else
+					{
+						movdqa(xmm3, ptr[&m_env.temp.t]);
+					}
+				}
+				else
+				{
+					// GSVector4 stq = m_env.d4.stq;
+
+					// s += stq.xxxx();
+					// t += stq.yyyy();
+					// q += stq.zzzz();
+
+					movaps(xmm2, ptr[&m_env.d4.stq]);
+					movaps(xmm3, xmm2);
+					movaps(xmm4, xmm2);
+
+					shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
+					shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
+					shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+					addps(xmm2, ptr[&m_env.temp.s]);
+					addps(xmm3, ptr[&m_env.temp.t]);
+					addps(xmm4, ptr[&m_env.temp.q]);
+
+					movaps(ptr[&m_env.temp.s], xmm2);
+					movaps(ptr[&m_env.temp.t], xmm3);
+					movaps(ptr[&m_env.temp.q], xmm4);
+
+					rcpps(xmm4, xmm4);
+					mulps(xmm2, xmm4);
+					mulps(xmm3, xmm4);
+				}
+			}
+
+			if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+			{
+				if(m_sel.iip)
+				{
+					// GSVector4i c = m_env.d4.c;
+
+					// rb = rb.add16(c.xxxx());
+					// ga = ga.add16(c.yyyy());
+
+					movdqa(xmm7, ptr[&m_env.d4.c]);
+
+					pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+					pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));
+
+					paddw(xmm5, ptr[&m_env.temp.rb]);
+					paddw(xmm6, ptr[&m_env.temp.ga]);
+
+					movdqa(ptr[&m_env.temp.rb], xmm5);
+					movdqa(ptr[&m_env.temp.ga], xmm6);
+				}
+				else
+				{
+					if(m_sel.tfx == TFX_NONE)
+					{
+						movdqa(xmm5, ptr[&m_env.c.rb]);
+						movdqa(xmm6, ptr[&m_env.c.ga]);
+					}
+				}
+			}
+		}
+
+		// test = m_test[7 + (steps & (steps >> 31))];
+
+		mov(edx, ecx);
+		sar(edx, 31);
+		and(edx, ecx);
+		shl(edx, 4);
+
+		movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
+	}
 }
 
 void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@@ -635,93 +1014,186 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
 	mov(ebp, dword[esi + 4]);
 	add(ebp, dword[edi + 4]);
 
-	// GSVector4i zs = zi;
-
-	if(!m_sel.sprite)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		if(m_sel.zoverflow)
+		// GSVector4i zs = zi;
+
+		if(!m_sel.sprite)
 		{
-			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+			if(m_sel.zoverflow)
+			{
+				// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
 
-			static float half = 0.5f;
+				static float half = 0.5f;
 
-			movss(temp1, dword[&half]);
-			shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0));
-			mulps(temp1, xmm0);
-			cvttps2dq(temp1, temp1);
-			pslld(temp1, 1);
+				vbroadcastss(temp1, dword[&half]);
+				vmulps(temp1, xmm0);
+				vcvttps2dq(temp1, temp1);
+				vpslld(temp1, 1);
 
-			cvttps2dq(xmm0, xmm0);
-			pcmpeqd(temp2, temp2);
-			psrld(temp2, 31);
-			pand(xmm0, temp2);
+				vcvttps2dq(xmm0, xmm0);
+				vpcmpeqd(temp2, temp2);
+				vpsrld(temp2, 31);
+				vpand(xmm0, temp2);
 
-			por(xmm0, temp1);
-		}
-		else
-		{
-			// zs = GSVector4i(z);
+				vpor(xmm0, temp1);
+			}
+			else
+			{
+				// zs = GSVector4i(z);
 
-			cvttps2dq(xmm0, xmm0);
+				vcvttps2dq(xmm0, xmm0);
+			}
+
+			if(m_sel.zwrite)
+			{
+				vmovdqa(ptr[&m_env.temp.zs], xmm0);
+			}
 		}
 
-		if(m_sel.zwrite)
+		if(m_sel.ztest)
 		{
-			movdqa(xmmword[&m_env.temp.zs], xmm0);
+			ReadPixel(xmm1, ebp);
+
+			if(m_sel.zwrite && m_sel.zpsm < 2)
+			{
+				vmovdqa(ptr[&m_env.temp.zd], xmm1);
+			}
+
+			// zd &= 0xffffffff >> m_sel.zpsm * 8;
+
+			if(m_sel.zpsm)
+			{
+				vpslld(xmm1, m_sel.zpsm * 8);
+				vpsrld(xmm1, m_sel.zpsm * 8);
+			}
+
+			if(m_sel.zoverflow || m_sel.zpsm == 0)
+			{
+				// GSVector4i o = GSVector4i::x80000000();
+
+				vpcmpeqd(xmm4, xmm4);
+				vpslld(xmm4, 31);
+
+				// GSVector4i zso = zs - o;
+
+				vpsubd(xmm0, xmm4);
+
+				// GSVector4i zdo = zd - o;
+
+				vpsubd(xmm1, xmm4);
+			}
+
+			switch(m_sel.ztst)
+			{
+			case ZTST_GEQUAL:
+				// test |= zso < zdo; // ~(zso >= zdo)
+				vpcmpgtd(xmm1, xmm0);
+				vpor(xmm7, xmm1);
+				break;
+
+			case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+				// test |= zso <= zdo; // ~(zso > zdo)
+				vpcmpgtd(xmm0, xmm1);
+				vpcmpeqd(xmm4, xmm4);
+				vpxor(xmm0, xmm4);
+				vpor(xmm7, xmm0);
+				break;
+			}
+
+			alltrue();
 		}
 	}
-
-	if(m_sel.ztest)
+	else
 	{
-		ReadPixel(xmm1, ebp);
+		// GSVector4i zs = zi;
 
-		if(m_sel.zwrite && m_sel.zpsm < 2)
+		if(!m_sel.sprite)
 		{
-			movdqa(xmmword[&m_env.temp.zd], xmm1);
+			if(m_sel.zoverflow)
+			{
+				// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+				static float half = 0.5f;
+
+				movss(temp1, dword[&half]);
+				shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0));
+				mulps(temp1, xmm0);
+				cvttps2dq(temp1, temp1);
+				pslld(temp1, 1);
+
+				cvttps2dq(xmm0, xmm0);
+				pcmpeqd(temp2, temp2);
+				psrld(temp2, 31);
+				pand(xmm0, temp2);
+
+				por(xmm0, temp1);
+			}
+			else
+			{
+				// zs = GSVector4i(z);
+
+				cvttps2dq(xmm0, xmm0);
+			}
+
+			if(m_sel.zwrite)
+			{
+				movdqa(ptr[&m_env.temp.zs], xmm0);
+			}
 		}
 
-		// zd &= 0xffffffff >> m_sel.zpsm * 8;
-
-		if(m_sel.zpsm)
+		if(m_sel.ztest)
 		{
-			pslld(xmm1, m_sel.zpsm * 8);
-			psrld(xmm1, m_sel.zpsm * 8);
+			ReadPixel(xmm1, ebp);
+
+			if(m_sel.zwrite && m_sel.zpsm < 2)
+			{
+				movdqa(ptr[&m_env.temp.zd], xmm1);
+			}
+
+			// zd &= 0xffffffff >> m_sel.zpsm * 8;
+
+			if(m_sel.zpsm)
+			{
+				pslld(xmm1, m_sel.zpsm * 8);
+				psrld(xmm1, m_sel.zpsm * 8);
+			}
+
+			if(m_sel.zoverflow || m_sel.zpsm == 0)
+			{
+				// GSVector4i o = GSVector4i::x80000000();
+
+				pcmpeqd(xmm4, xmm4);
+				pslld(xmm4, 31);
+
+				// GSVector4i zso = zs - o;
+
+				psubd(xmm0, xmm4);
+
+				// GSVector4i zdo = zd - o;
+
+				psubd(xmm1, xmm4);
+			}
+
+			switch(m_sel.ztst)
+			{
+			case ZTST_GEQUAL:
+				// test |= zso < zdo; // ~(zso >= zdo)
+				pcmpgtd(xmm1, xmm0);
+				por(xmm7, xmm1);
+				break;
+
+			case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+				// test |= zso <= zdo; // ~(zso > zdo)
+				pcmpgtd(xmm0, xmm1);
+				pcmpeqd(xmm4, xmm4);
+				pxor(xmm0, xmm4);
+				por(xmm7, xmm0);
+				break;
+			}
+
+			alltrue();
 		}
-
-		if(m_sel.zoverflow || m_sel.zpsm == 0)
-		{
-			// GSVector4i o = GSVector4i::x80000000();
-
-			pcmpeqd(xmm4, xmm4);
-			pslld(xmm4, 31);
-
-			// GSVector4i zso = zs - o;
-
-			psubd(xmm0, xmm4);
-
-			// GSVector4i zdo = zd - o;
-
-			psubd(xmm1, xmm4);
-		}
-
-		switch(m_sel.ztst)
-		{
-		case ZTST_GEQUAL:
-			// test |= zso < zdo; // ~(zso >= zdo)
-			pcmpgtd(xmm1, xmm0);
-			por(xmm7, xmm1);
-			break;
-
-		case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
-			// test |= zso <= zdo; // ~(zso > zdo)
-			pcmpgtd(xmm0, xmm1);
-			pcmpeqd(xmm4, xmm4);
-			pxor(xmm0, xmm4);
-			por(xmm7, xmm0);
-			break;
-		}
-
-		alltrue();
 	}
 }
 
@@ -736,272 +1208,531 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
 
 	// ebx = tex
 
-	if(!m_sel.fst)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// TODO: move these into Init/Step too?
+		if(!m_sel.fst)
+		{
+			// TODO: move these into Init/Step too?
 
-		cvttps2dq(xmm2, xmm2);
-		cvttps2dq(xmm3, xmm3);
+			vcvttps2dq(xmm2, xmm2);
+			vcvttps2dq(xmm3, xmm3);
+
+			if(m_sel.ltf)
+			{
+				// u -= 0x8000;
+				// v -= 0x8000;
+
+				mov(eax, 0x8000);
+				vmovd(xmm4, eax);
+				vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vpsubd(xmm2, xmm4);
+				vpsubd(xmm3, xmm4);
+			}
+		}
+
+		// xmm2 = u
+		// xmm3 = v
 
 		if(m_sel.ltf)
 		{
-			// u -= 0x8000;
-			// v -= 0x8000;
+			// GSVector4i uf = u.xxzzlh().srl16(1);
 
-			mov(eax, 0x8000);
-			movd(xmm4, eax);
-			pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
-			psubd(xmm2, xmm4);
-			psubd(xmm3, xmm4);
+			vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(xmm0, 1);
+			vmovdqa(ptr[&m_env.temp.uf], xmm0);
+
+			if(!m_sel.sprite)
+			{
+				// GSVector4i vf = v.xxzzlh().srl16(1);
+
+				vpshuflw(xmm1, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
+				vpsrlw(xmm1, 1);
+				vmovdqa(ptr[&m_env.temp.vf], xmm1);
+			}
 		}
-	}
 
-	// xmm2 = u
-	// xmm3 = v
+		// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
 
-	if(m_sel.ltf)
-	{
-		// GSVector4i uf = u.xxzzlh().srl16(1);
+		vpsrad(xmm2, 16);
+		vpsrad(xmm3, 16);
+		vpackssdw(xmm2, xmm3);
 
-		movdqa(xmm0, xmm2);
-		pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
-		pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
-		psrlw(xmm0, 1);
-		movdqa(xmmword[&m_env.temp.uf], xmm0);
-
-		if(!m_sel.sprite)
+		if(m_sel.ltf)
 		{
-			// GSVector4i vf = v.xxzzlh().srl16(1);
+			// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
 
-			movdqa(xmm1, xmm3);
-			pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
-			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
-			psrlw(xmm1, 1);
-			movdqa(xmmword[&m_env.temp.vf], xmm1);
+			vpcmpeqd(xmm1, xmm1);
+			vpsrlw(xmm1, 15);
+			vpaddw(xmm3, xmm2, xmm1);
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			Wrap(xmm2, xmm3);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			Wrap(xmm2);
+		}
+
+		// xmm2 = uv0
+		// xmm3 = uv1 (ltf)
+		// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i x0 = uv0.upl16();
+		// GSVector4i y0 = uv0.uph16() << tw;
+
+		vpxor(xmm0, xmm0);
+		vmovd(xmm1, ptr[&m_env.tw]);
+
+		vpunpcklwd(xmm4, xmm2, xmm0);
+		vpunpckhwd(xmm2, xmm2, xmm0);
+		vpslld(xmm2, xmm1);
+
+		// xmm0 = 0
+		// xmm1 = tw
+		// xmm2 = y0
+		// xmm3 = uv1 (ltf)
+		// xmm4 = x0
+		// xmm5, xmm6 = free
+		// xmm7 = used
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i x1 = uv1.upl16();
+			// GSVector4i y1 = uv1.uph16() << tw;
+
+			vpunpcklwd(xmm6, xmm3, xmm0);
+			vpunpckhwd(xmm3, xmm3, xmm0);
+			vpslld(xmm3, xmm1);
+
+			// xmm2 = y0
+			// xmm3 = y1
+			// xmm4 = x0
+			// xmm6 = x1
+			// xmm0, xmm1, xmm5 = free
+			// xmm7 = used
+
+			// GSVector4i addr00 = y0 + x0;
+			// GSVector4i addr01 = y0 + x1;
+			// GSVector4i addr10 = y1 + x0;
+			// GSVector4i addr11 = y1 + x1;
+
+			vpaddd(xmm5, xmm2, xmm4);
+			vpaddd(xmm2, xmm2, xmm6);
+			vpaddd(xmm0, xmm3, xmm4);
+			vpaddd(xmm3, xmm3, xmm6);
+
+			// xmm5 = addr00
+			// xmm2 = addr01
+			// xmm0 = addr10
+			// xmm3 = addr11
+			// xmm1, xmm4, xmm6 = free
+			// xmm7 = used
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(xmm6, xmm5, xmm1, xmm4);
+
+			// xmm2, xmm5, xmm1 = free
+
+			ReadTexel(xmm4, xmm2, xmm5, xmm1);
+
+			// xmm0, xmm2, xmm5 = free
+
+			ReadTexel(xmm1, xmm0, xmm2, xmm5);
+
+			// xmm3, xmm0, xmm2 = free
+
+			ReadTexel(xmm5, xmm3, xmm0, xmm2);
+
+			// xmm6 = c00
+			// xmm4 = c01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm0, xmm2, xmm3 = free
+			// xmm7 = used
+
+			vmovdqa(xmm0, ptr[&m_env.temp.uf]);
+
+			// GSVector4i rb00 = c00 & mask;
+			// GSVector4i ga00 = (c00 >> 8) & mask;
+
+			vpsllw(xmm2, xmm6, 8);
+			vpsrlw(xmm2, 8);
+			vpsrlw(xmm6, 8);
+
+			// GSVector4i rb01 = c01 & mask;
+			// GSVector4i ga01 = (c01 >> 8) & mask;
+
+			vpsllw(xmm3, xmm4, 8);
+			vpsrlw(xmm3, 8);
+			vpsrlw(xmm4, 8);
+
+			// xmm0 = uf
+			// xmm2 = rb00
+			// xmm3 = rb01
+			// xmm6 = ga00
+			// xmm4 = ga01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm7 = used
+
+			// rb00 = rb00.lerp16<0>(rb01, uf);
+			// ga00 = ga00.lerp16<0>(ga01, uf);
+
+			lerp16<0>(xmm3, xmm2, xmm0);
+			lerp16<0>(xmm4, xmm6, xmm0);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm2, xmm6 = free
+			// xmm7 = used
+
+			// GSVector4i rb10 = c10 & mask;
+			// GSVector4i ga10 = (c10 >> 8) & mask;
+
+			vpsrlw(xmm2, xmm1, 8);
+			vpsllw(xmm1, 8);
+			vpsrlw(xmm1, 8);
+
+			// GSVector4i rb11 = c11 & mask;
+			// GSVector4i ga11 = (c11 >> 8) & mask;
+
+			vpsrlw(xmm6, xmm5, 8);
+			vpsllw(xmm5, 8);
+			vpsrlw(xmm5, 8);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = rb10
+			// xmm5 = rb11
+			// xmm2 = ga10
+			// xmm6 = ga11
+			// xmm7 = used
+
+			// rb10 = rb10.lerp16<0>(rb11, uf);
+			// ga10 = ga10.lerp16<0>(ga11, uf);
+
+			lerp16<0>(xmm5, xmm1, xmm0);
+			lerp16<0>(xmm6, xmm2, xmm0);
+
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm5 = rb10
+			// xmm6 = ga10
+			// xmm0, xmm1, xmm2 = free
+			// xmm7 = used
+
+			// rb00 = rb00.lerp16<0>(rb10, vf);
+			// ga00 = ga00.lerp16<0>(ga10, vf);
+
+			vmovdqa(xmm0, ptr[&m_env.temp.vf]);
+
+			lerp16<0>(xmm5, xmm3, xmm0);
+			lerp16<0>(xmm6, xmm4, xmm0);
+		}
+		else
+		{
+			// GSVector4i addr00 = y0 + x0;
+
+			vpaddd(xmm2, xmm4);
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(xmm5, xmm2, xmm0, xmm1);
+
+			// GSVector4i mask = GSVector4i::x00ff();
+
+			// c[0] = c00 & mask;
+			// c[1] = (c00 >> 8) & mask;
+
+			vpsrlw(xmm6, xmm5, 8);
+			vpsllw(xmm5, 8);
+			vpsrlw(xmm5, 8);
 		}
 	}
-
-	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
-
-	psrad(xmm2, 16);
-	psrad(xmm3, 16);
-	packssdw(xmm2, xmm3);
-
-	if(m_sel.ltf)
-	{
-		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
-
-		movdqa(xmm3, xmm2);
-		pcmpeqd(xmm1, xmm1);
-		psrlw(xmm1, 15);
-		paddw(xmm3, xmm1);
-
-		// uv0 = Wrap(uv0);
-		// uv1 = Wrap(uv1);
-
-		Wrap(xmm2, xmm3);
-	}
 	else
 	{
-		// uv0 = Wrap(uv0);
+		if(!m_sel.fst)
+		{
+			// TODO: move these into Init/Step too?
 
-		Wrap(xmm2);
-	}
+			cvttps2dq(xmm2, xmm2);
+			cvttps2dq(xmm3, xmm3);
 
-	// xmm2 = uv0
-	// xmm3 = uv1 (ltf)
-	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
-	// xmm7 = used
+			if(m_sel.ltf)
+			{
+				// u -= 0x8000;
+				// v -= 0x8000;
 
-	// GSVector4i y0 = uv0.uph16() << tw;
-	// GSVector4i x0 = uv0.upl16();
+				mov(eax, 0x8000);
+				movd(xmm4, eax);
+				pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				psubd(xmm2, xmm4);
+				psubd(xmm3, xmm4);
+			}
+		}
 
-	pxor(xmm0, xmm0);
-	movd(xmm1, ptr[&m_env.tw]);
+		// xmm2 = u
+		// xmm3 = v
 
-	movdqa(xmm4, xmm2);
-	punpckhwd(xmm2, xmm0);
-	punpcklwd(xmm4, xmm0);
-	pslld(xmm2, xmm1);
+		if(m_sel.ltf)
+		{
+			// GSVector4i uf = u.xxzzlh().srl16(1);
 
-	// xmm0 = 0
-	// xmm1 = tw
-	// xmm2 = y0
-	// xmm3 = uv1 (ltf)
-	// xmm4 = x0
-	// xmm5, xmm6 = free
-	// xmm7 = used
+			movdqa(xmm0, xmm2);
+			pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			psrlw(xmm0, 1);
+			movdqa(ptr[&m_env.temp.uf], xmm0);
 
-	if(m_sel.ltf)
-	{
-		// GSVector4i y1 = uv1.uph16() << tw;
-		// GSVector4i x1 = uv1.upl16();
+			if(!m_sel.sprite)
+			{
+				// GSVector4i vf = v.xxzzlh().srl16(1);
 
-		movdqa(xmm6, xmm3);
-		punpckhwd(xmm3, xmm0);
-		punpcklwd(xmm6, xmm0);
-		pslld(xmm3, xmm1);
+				movdqa(xmm1, xmm3);
+				pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
+				pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
+				psrlw(xmm1, 1);
+				movdqa(ptr[&m_env.temp.vf], xmm1);
+			}
+		}
 
+		// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+		psrad(xmm2, 16);
+		psrad(xmm3, 16);
+		packssdw(xmm2, xmm3);
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+			movdqa(xmm3, xmm2);
+			pcmpeqd(xmm1, xmm1);
+			psrlw(xmm1, 15);
+			paddw(xmm3, xmm1);
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			Wrap(xmm2, xmm3);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			Wrap(xmm2);
+		}
+
+		// xmm2 = uv0
+		// xmm3 = uv1 (ltf)
+		// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i y0 = uv0.uph16() << tw;
+		// GSVector4i x0 = uv0.upl16();
+
+		pxor(xmm0, xmm0);
+		movd(xmm1, ptr[&m_env.tw]);
+
+		movdqa(xmm4, xmm2);
+		punpckhwd(xmm2, xmm0);
+		punpcklwd(xmm4, xmm0);
+		pslld(xmm2, xmm1);
+
+		// xmm0 = 0
+		// xmm1 = tw
 		// xmm2 = y0
-		// xmm3 = y1
+		// xmm3 = uv1 (ltf)
 		// xmm4 = x0
-		// xmm6 = x1
-		// xmm0, xmm5, xmm6 = free
+		// xmm5, xmm6 = free
 		// xmm7 = used
 
-		// GSVector4i addr00 = y0 + x0;
-		// GSVector4i addr01 = y0 + x1;
-		// GSVector4i addr10 = y1 + x0;
-		// GSVector4i addr11 = y1 + x1;
+		if(m_sel.ltf)
+		{
+			// GSVector4i y1 = uv1.uph16() << tw;
+			// GSVector4i x1 = uv1.upl16();
 
-		movdqa(xmm5, xmm2);
-		paddd(xmm5, xmm4);
-		paddd(xmm2, xmm6);
+			movdqa(xmm6, xmm3);
+			punpckhwd(xmm3, xmm0);
+			punpcklwd(xmm6, xmm0);
+			pslld(xmm3, xmm1);
 
-		movdqa(xmm0, xmm3);
-		paddd(xmm0, xmm4);
-		paddd(xmm3, xmm6);
+			// xmm2 = y0
+			// xmm3 = y1
+			// xmm4 = x0
+			// xmm6 = x1
+			// xmm0, xmm1, xmm5 = free
+			// xmm7 = used
 
-		// xmm5 = addr00
-		// xmm2 = addr01
-		// xmm0 = addr10
-		// xmm3 = addr11
-		// xmm1, xmm4, xmm6 = free
-		// xmm7 = used
+			// GSVector4i addr00 = y0 + x0;
+			// GSVector4i addr01 = y0 + x1;
+			// GSVector4i addr10 = y1 + x0;
+			// GSVector4i addr11 = y1 + x1;
 
-		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
-		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
-		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
-		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+			movdqa(xmm5, xmm2);
+			paddd(xmm5, xmm4);
+			paddd(xmm2, xmm6);
 
-		ReadTexel(xmm6, xmm5, xmm1, xmm4);
+			movdqa(xmm0, xmm3);
+			paddd(xmm0, xmm4);
+			paddd(xmm3, xmm6);
 
-		// xmm2, xmm5, xmm1 = free
+			// xmm5 = addr00
+			// xmm2 = addr01
+			// xmm0 = addr10
+			// xmm3 = addr11
+			// xmm1, xmm4, xmm6 = free
+			// xmm7 = used
 
-		ReadTexel(xmm4, xmm2, xmm5, xmm1);
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
 
-		// xmm0, xmm2, xmm5 = free
+			ReadTexel(xmm6, xmm5, xmm1, xmm4);
 
-		ReadTexel(xmm1, xmm0, xmm2, xmm5);
+			// xmm2, xmm5, xmm1 = free
 
-		// xmm3, xmm0, xmm2 = free
+			ReadTexel(xmm4, xmm2, xmm5, xmm1);
 
-		ReadTexel(xmm5, xmm3, xmm0, xmm2);
+			// xmm0, xmm2, xmm5 = free
 
-		// xmm6 = c00
-		// xmm4 = c01
-		// xmm1 = c10
-		// xmm5 = c11
-		// xmm0, xmm2, xmm3 = free
-		// xmm7 = used
+			ReadTexel(xmm1, xmm0, xmm2, xmm5);
 
-		movdqa(xmm0, xmmword[&m_env.temp.uf]);
+			// xmm3, xmm0, xmm2 = free
 
-		// GSVector4i rb00 = c00 & mask;
-		// GSVector4i ga00 = (c00 >> 8) & mask;
+			ReadTexel(xmm5, xmm3, xmm0, xmm2);
 
-		movdqa(xmm2, xmm6);
-		psllw(xmm2, 8);
-		psrlw(xmm2, 8);
-		psrlw(xmm6, 8);
+			// xmm6 = c00
+			// xmm4 = c01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm0, xmm2, xmm3 = free
+			// xmm7 = used
 
-		// GSVector4i rb01 = c01 & mask;
-		// GSVector4i ga01 = (c01 >> 8) & mask;
+			movdqa(xmm0, ptr[&m_env.temp.uf]);
 
-		movdqa(xmm3, xmm4);
-		psllw(xmm3, 8);
-		psrlw(xmm3, 8);
-		psrlw(xmm4, 8);
+			// GSVector4i rb00 = c00 & mask;
+			// GSVector4i ga00 = (c00 >> 8) & mask;
 
-		// xmm0 = uf
-		// xmm2 = rb00
-		// xmm3 = rb01
-		// xmm6 = ga00
-		// xmm4 = ga01
-		// xmm1 = c10
-		// xmm5 = c11
-		// xmm7 = used
+			movdqa(xmm2, xmm6);
+			psllw(xmm2, 8);
+			psrlw(xmm2, 8);
+			psrlw(xmm6, 8);
 
-		// rb00 = rb00.lerp16<0>(rb01, uf);
-		// ga00 = ga00.lerp16<0>(ga01, uf);
+			// GSVector4i rb01 = c01 & mask;
+			// GSVector4i ga01 = (c01 >> 8) & mask;
 
-		lerp16<0>(xmm3, xmm2, xmm0);
-		lerp16<0>(xmm4, xmm6, xmm0);
+			movdqa(xmm3, xmm4);
+			psllw(xmm3, 8);
+			psrlw(xmm3, 8);
+			psrlw(xmm4, 8);
 
-		// xmm0 = uf
-		// xmm3 = rb00
-		// xmm4 = ga00
-		// xmm1 = c10
-		// xmm5 = c11
-		// xmm2, xmm6 = free
-		// xmm7 = used
+			// xmm0 = uf
+			// xmm2 = rb00
+			// xmm3 = rb01
+			// xmm6 = ga00
+			// xmm4 = ga01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm7 = used
 
-		// GSVector4i rb10 = c10 & mask;
-		// GSVector4i ga10 = (c10 >> 8) & mask;
+			// rb00 = rb00.lerp16<0>(rb01, uf);
+			// ga00 = ga00.lerp16<0>(ga01, uf);
 
-		movdqa(xmm2, xmm1);
-		psllw(xmm1, 8);
-		psrlw(xmm1, 8);
-		psrlw(xmm2, 8);
+			lerp16<0>(xmm3, xmm2, xmm0);
+			lerp16<0>(xmm4, xmm6, xmm0);
 
-		// GSVector4i rb11 = c11 & mask;
-		// GSVector4i ga11 = (c11 >> 8) & mask;
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm2, xmm6 = free
+			// xmm7 = used
 
-		movdqa(xmm6, xmm5);
-		psllw(xmm5, 8);
-		psrlw(xmm5, 8);
-		psrlw(xmm6, 8);
+			// GSVector4i rb10 = c10 & mask;
+			// GSVector4i ga10 = (c10 >> 8) & mask;
 
-		// xmm0 = uf
-		// xmm3 = rb00
-		// xmm4 = ga00
-		// xmm1 = rb10
-		// xmm5 = rb11
-		// xmm2 = ga10
-		// xmm6 = ga11
-		// xmm7 = used
+			movdqa(xmm2, xmm1);
+			psllw(xmm1, 8);
+			psrlw(xmm1, 8);
+			psrlw(xmm2, 8);
 
-		// rb10 = rb10.lerp16<0>(rb11, uf);
-		// ga10 = ga10.lerp16<0>(ga11, uf);
+			// GSVector4i rb11 = c11 & mask;
+			// GSVector4i ga11 = (c11 >> 8) & mask;
 
-		lerp16<0>(xmm5, xmm1, xmm0);
-		lerp16<0>(xmm6, xmm2, xmm0);
+			movdqa(xmm6, xmm5);
+			psllw(xmm5, 8);
+			psrlw(xmm5, 8);
+			psrlw(xmm6, 8);
 
-		// xmm3 = rb00
-		// xmm4 = ga00
-		// xmm5 = rb10
-		// xmm6 = ga10
-		// xmm0, xmm1, xmm2 = free
-		// xmm7 = used
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = rb10
+			// xmm5 = rb11
+			// xmm2 = ga10
+			// xmm6 = ga11
+			// xmm7 = used
 
-		// rb00 = rb00.lerp16<0>(rb10, vf);
-		// ga00 = ga00.lerp16<0>(ga10, vf);
+			// rb10 = rb10.lerp16<0>(rb11, uf);
+			// ga10 = ga10.lerp16<0>(ga11, uf);
 
-		movdqa(xmm0, xmmword[&m_env.temp.vf]);
+			lerp16<0>(xmm5, xmm1, xmm0);
+			lerp16<0>(xmm6, xmm2, xmm0);
 
-		lerp16<0>(xmm5, xmm3, xmm0);
-		lerp16<0>(xmm6, xmm4, xmm0);
-	}
-	else
-	{
-		// GSVector4i addr00 = y0 + x0;
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm5 = rb10
+			// xmm6 = ga10
+			// xmm0, xmm1, xmm2 = free
+			// xmm7 = used
 
-		paddd(xmm2, xmm4);
+			// rb00 = rb00.lerp16<0>(rb10, vf);
+			// ga00 = ga00.lerp16<0>(ga10, vf);
 
-		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			movdqa(xmm0, ptr[&m_env.temp.vf]);
 
-		ReadTexel(xmm5, xmm2, xmm0, xmm1);
+			lerp16<0>(xmm5, xmm3, xmm0);
+			lerp16<0>(xmm6, xmm4, xmm0);
+		}
+		else
+		{
+			// GSVector4i addr00 = y0 + x0;
 
-		// GSVector4i mask = GSVector4i::x00ff();
+			paddd(xmm2, xmm4);
 
-		// c[0] = c00 & mask;
-		// c[1] = (c00 >> 8) & mask;
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
 
-		movdqa(xmm6, xmm5);
+			ReadTexel(xmm5, xmm2, xmm0, xmm1);
 
-		psllw(xmm5, 8);
-		psrlw(xmm5, 8);
-		psrlw(xmm6, 8);
+			// GSVector4i mask = GSVector4i::x00ff();
+
+			// c[0] = c00 & mask;
+			// c[1] = (c00 >> 8) & mask;
+
+			movdqa(xmm6, xmm5);
+
+			psllw(xmm5, 8);
+			psrlw(xmm5, 8);
+			psrlw(xmm6, 8);
+		}
 	}
 }
 
@@ -1014,57 +1745,116 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
 
 	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
 
-	if(wms_clamp == wmt_clamp)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		if(wms_clamp)
+		if(wms_clamp == wmt_clamp)
 		{
-			if(region)
+			if(wms_clamp)
 			{
-				pmaxsw(uv, xmmword[&m_env.t.min]);
+				if(region)
+				{
+					vpmaxsw(uv, ptr[&m_env.t.min]);
+				}
+				else
+				{
+					vpxor(xmm0, xmm0);
+					vpmaxsw(uv, xmm0);
+				}
+
+				vpminsw(uv, ptr[&m_env.t.max]);
 			}
 			else
 			{
-				pxor(xmm0, xmm0);
-				pmaxsw(uv, xmm0);
-			}
+				vpand(uv, ptr[&m_env.t.min]);
 
-			pminsw(uv, xmmword[&m_env.t.max]);
+				if(region)
+				{
+					vpor(uv, ptr[&m_env.t.max]);
+				}
+			}
 		}
 		else
 		{
-			pand(uv, xmmword[&m_env.t.min]);
+			vmovdqa(xmm1, uv);
+
+			vmovdqa(xmm4, ptr[&m_env.t.min]);
+			vmovdqa(xmm5, ptr[&m_env.t.max]);
+
+			// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
+
+			vpmaxsw(uv, xmm4);
+			vpminsw(uv, xmm5);
+
+			// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
+
+			vpand(xmm1, xmm4);
 
 			if(region)
 			{
-				por(uv, xmmword[&m_env.t.max]);
+				vpor(xmm1, xmm5);
 			}
+
+			// clamp.blend8(repeat, m_env.t.mask);
+
+			vmovdqa(xmm0, ptr[&m_env.t.mask]);
+
+			vpblendvb(uv, xmm1, xmm0);
 		}
 	}
 	else
 	{
-		movdqa(xmm1, uv);
-
-		movdqa(xmm4, xmmword[&m_env.t.min]);
-		movdqa(xmm5, xmmword[&m_env.t.max]);
-
-		// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
-
-		pmaxsw(uv, xmm4);
-		pminsw(uv, xmm5);
-
-		// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
-
-		pand(xmm1, xmm4);
-
-		if(region)
+		if(wms_clamp == wmt_clamp)
 		{
-			por(xmm1, xmm5);
+			if(wms_clamp)
+			{
+				if(region)
+				{
+					pmaxsw(uv, ptr[&m_env.t.min]);
+				}
+				else
+				{
+					pxor(xmm0, xmm0);
+					pmaxsw(uv, xmm0);
+				}
+
+				pminsw(uv, ptr[&m_env.t.max]);
+			}
+			else
+			{
+				pand(uv, ptr[&m_env.t.min]);
+
+				if(region)
+				{
+					por(uv, ptr[&m_env.t.max]);
+				}
+			}
 		}
+		else
+		{
+			movdqa(xmm1, uv);
 
-		// clamp.blend8(repeat, m_env.t.mask);
+			movdqa(xmm4, ptr[&m_env.t.min]);
+			movdqa(xmm5, ptr[&m_env.t.max]);
 
-		movdqa(xmm0, xmmword[&m_env.t.mask]);
-		blend8(uv, xmm1);
+			// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
+
+			pmaxsw(uv, xmm4);
+			pminsw(uv, xmm5);
+
+			// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
+
+			pand(xmm1, xmm4);
+
+			if(region)
+			{
+				por(xmm1, xmm5);
+			}
+
+			// clamp.blend8(repeat, m_env.t.mask);
+
+			movdqa(xmm0, ptr[&m_env.t.mask]);
+			blend8(uv, xmm1);
+		}
 	}
 }
 
@@ -1077,92 +1867,187 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
 
 	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
 
-	if(wms_clamp == wmt_clamp)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		if(wms_clamp)
+		if(wms_clamp == wmt_clamp)
 		{
-			if(region)
+			if(wms_clamp)
 			{
-				movdqa(xmm4, xmmword[&m_env.t.min]);
+				if(region)
+				{
+					vmovdqa(xmm4, ptr[&m_env.t.min]);
 
-				pmaxsw(uv0, xmm4);
-				pmaxsw(uv1, xmm4);
+					vpmaxsw(uv0, xmm4);
+					vpmaxsw(uv1, xmm4);
+				}
+				else
+				{
+					vpxor(xmm0, xmm0);
+
+					vpmaxsw(uv0, xmm0);
+					vpmaxsw(uv1, xmm0);
+				}
+
+				vmovdqa(xmm5, ptr[&m_env.t.max]);
+
+				vpminsw(uv0, xmm5);
+				vpminsw(uv1, xmm5);
 			}
 			else
 			{
-				pxor(xmm0, xmm0);
-				pmaxsw(uv0, xmm0);
-				pmaxsw(uv1, xmm0);
+				vmovdqa(xmm4, ptr[&m_env.t.min]);
+
+				vpand(uv0, xmm4);
+				vpand(uv1, xmm4);
+
+				if(region)
+				{
+					vmovdqa(xmm5, ptr[&m_env.t.max]);
+
+					vpor(uv0, xmm5);
+					vpor(uv1, xmm5);
+				}
 			}
-
-			movdqa(xmm5, xmmword[&m_env.t.max]);
-
-			pminsw(uv0, xmm5);
-			pminsw(uv1, xmm5);
 		}
 		else
 		{
-			movdqa(xmm4, xmmword[&m_env.t.min]);
+			vmovdqa(xmm1, uv0);
+			vmovdqa(xmm6, uv1);
 
-			pand(uv0, xmm4);
-			pand(uv1, xmm4);
+			vmovdqa(xmm4, ptr[&m_env.t.min]);
+			vmovdqa(xmm5, ptr[&m_env.t.max]);
+
+			// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
+
+			vpmaxsw(uv0, xmm4);
+			vpmaxsw(uv1, xmm4);
+			vpminsw(uv0, xmm5);
+			vpminsw(uv1, xmm5);
+
+			// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
+
+			vpand(xmm1, xmm4);
+			vpand(xmm6, xmm4);
 
 			if(region)
 			{
-				movdqa(xmm5, xmmword[&m_env.t.max]);
+				vpor(xmm1, xmm5);
+				vpor(xmm6, xmm5);
+			}
 
-				por(uv0, xmm5);
-				por(uv1, xmm5);
+			// clamp.blend8(repeat, m_env.t.mask);
+
+			if(m_cpu.has(util::Cpu::tSSE41))
+			{
+				vmovdqa(xmm0, ptr[&m_env.t.mask]);
+
+				vpblendvb(uv0, xmm1, xmm0);
+				vpblendvb(uv1, xmm6, xmm0);
+			}
+			else
+			{
+				vmovdqa(xmm0, ptr[&m_env.t.invmask]);
+				vmovdqa(xmm4, xmm0);
+
+				vpand(uv0, xmm0);
+				vpandn(xmm0, xmm1);
+				vpor(uv0, xmm0);
+
+				vpand(uv1, xmm4);
+				vpandn(xmm4, xmm6);
+				vpor(uv1, xmm4);
 			}
 		}
 	}
 	else
 	{
-		movdqa(xmm1, uv0);
-		movdqa(xmm6, uv1);
-
-		movdqa(xmm4, xmmword[&m_env.t.min]);
-		movdqa(xmm5, xmmword[&m_env.t.max]);
-
-		// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
-
-		pmaxsw(uv0, xmm4);
-		pmaxsw(uv1, xmm4);
-		pminsw(uv0, xmm5);
-		pminsw(uv1, xmm5);
-
-		// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
-
-		pand(xmm1, xmm4);
-		pand(xmm6, xmm4);
-
-		if(region)
+		if(wms_clamp == wmt_clamp)
 		{
-			por(xmm1, xmm5);
-			por(xmm6, xmm5);
-		}
+			if(wms_clamp)
+			{
+				if(region)
+				{
+					movdqa(xmm4, ptr[&m_env.t.min]);
 
-		// clamp.blend8(repeat, m_env.t.mask);
+					pmaxsw(uv0, xmm4);
+					pmaxsw(uv1, xmm4);
+				}
+				else
+				{
+					pxor(xmm0, xmm0);
+					pmaxsw(uv0, xmm0);
+					pmaxsw(uv1, xmm0);
+				}
 
-		if(m_cpu.has(util::Cpu::tSSE41))
-		{
-			movdqa(xmm0, xmmword[&m_env.t.mask]);
+				movdqa(xmm5, ptr[&m_env.t.max]);
 
-			pblendvb(uv0, xmm1);
-			pblendvb(uv1, xmm6);
+				pminsw(uv0, xmm5);
+				pminsw(uv1, xmm5);
+			}
+			else
+			{
+				movdqa(xmm4, ptr[&m_env.t.min]);
+
+				pand(uv0, xmm4);
+				pand(uv1, xmm4);
+
+				if(region)
+				{
+					movdqa(xmm5, ptr[&m_env.t.max]);
+
+					por(uv0, xmm5);
+					por(uv1, xmm5);
+				}
+			}
 		}
 		else
 		{
-			movdqa(xmm0, xmmword[&m_env.t.invmask]);
-			movdqa(xmm4, xmm0);
+			movdqa(xmm1, uv0);
+			movdqa(xmm6, uv1);
 
-			pand(uv0, xmm0);
-			pandn(xmm0, xmm1);
-			por(uv0, xmm0);
+			movdqa(xmm4, ptr[&m_env.t.min]);
+			movdqa(xmm5, ptr[&m_env.t.max]);
 
-			pand(uv1, xmm4);
-			pandn(xmm4, xmm6);
-			por(uv1, xmm4);
+			// GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);
+
+			pmaxsw(uv0, xmm4);
+			pmaxsw(uv1, xmm4);
+			pminsw(uv0, xmm5);
+			pminsw(uv1, xmm5);
+
+			// GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;
+
+			pand(xmm1, xmm4);
+			pand(xmm6, xmm4);
+
+			if(region)
+			{
+				por(xmm1, xmm5);
+				por(xmm6, xmm5);
+			}
+
+			// clamp.blend8(repeat, m_env.t.mask);
+
+			if(m_cpu.has(util::Cpu::tSSE41))
+			{
+				movdqa(xmm0, ptr[&m_env.t.mask]);
+
+				pblendvb(uv0, xmm1);
+				pblendvb(uv1, xmm6);
+			}
+			else
+			{
+				movdqa(xmm0, ptr[&m_env.t.invmask]);
+				movdqa(xmm4, xmm0);
+
+				pand(uv0, xmm0);
+				pandn(xmm0, xmm1);
+				por(uv0, xmm0);
+
+				pand(uv1, xmm4);
+				pandn(xmm4, xmm6);
+				por(uv1, xmm4);
+			}
 		}
 	}
 }
@@ -1174,143 +2059,288 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
 		return;
 	}
 
-	switch(m_sel.tfx)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-	case TFX_MODULATE:
-
-		// GSVector4i ga = iip ? gaf : m_env.c.ga;
-
-		movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
-
-		// gat = gat.modulate16<1>(ga).clamp8();
-
-		modulate16<1>(xmm6, xmm4);
-
-		clamp16(xmm6, xmm3);
-
-		// if(!tcc) gat = gat.mix16(ga.srl16(7));
-
-		if(!m_sel.tcc)
+		switch(m_sel.tfx)
 		{
-			psrlw(xmm4, 7);
+		case TFX_MODULATE:
 
-			mix16(xmm6, xmm4, xmm3);
-		}
-
-		break;
-
-	case TFX_DECAL:
-
-		// if(!tcc) gat = gat.mix16(ga.srl16(7));
-
-		if(!m_sel.tcc)
-		{
 			// GSVector4i ga = iip ? gaf : m_env.c.ga;
 
-			movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			vmovdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
 
-			psrlw(xmm4, 7);
+			// gat = gat.modulate16<1>(ga).clamp8();
 
-			mix16(xmm6, xmm4, xmm3);
-		}
+			modulate16<1>(xmm6, xmm4);
 
-		break;
+			clamp16(xmm6, xmm3);
 
-	case TFX_HIGHLIGHT:
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
 
-		// GSVector4i ga = iip ? gaf : m_env.c.ga;
-
-		movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
-		movdqa(xmm2, xmm4);
-
-		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
-
-		psrlw(xmm4, 7);
-
-		if(m_sel.tcc)
-		{
-			paddusb(xmm4, xmm6);
-		}
-
-		mix16(xmm6, xmm4, xmm3);
-
-		break;
-
-	case TFX_HIGHLIGHT2:
-
-		// if(!tcc) gat = gat.mix16(ga.srl16(7));
-
-		if(!m_sel.tcc)
-		{
-			// GSVector4i ga = iip ? gaf : m_env.c.ga;
-
-			movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
-			movdqa(xmm2, xmm4);
-
-			psrlw(xmm4, 7);
-
-			mix16(xmm6, xmm4, xmm3);
-		}
-
-		break;
-
-	case TFX_NONE:
-
-		// gat = iip ? ga.srl16(7) : ga;
-
-		if(m_sel.iip)
-		{
-			psrlw(xmm6, 7);
-		}
-
-		break;
-	}
-
-	if(m_sel.aa1)
-	{
-		// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
-
-		// FIXME: bios config screen cubes
-
-		if(!m_sel.abe)
-		{
-			// a = cov
-
-			if(m_sel.edge)
+			if(!m_sel.tcc)
 			{
-				movdqa(xmm0, xmmword[&m_env.temp.cov]);
+				vpsrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_DECAL:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if(!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				vmovdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+
+				vpsrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_HIGHLIGHT:
+
+			// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+			vmovdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			vmovdqa(xmm2, xmm4);
+
+			// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+			vpsrlw(xmm4, 7);
+
+			if(m_sel.tcc)
+			{
+				vpaddusb(xmm4, xmm6);
+			}
+
+			mix16(xmm6, xmm4, xmm3);
+
+			break;
+
+		case TFX_HIGHLIGHT2:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if(!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				vmovdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+				vmovdqa(xmm2, xmm4);
+
+				vpsrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_NONE:
+
+			// gat = iip ? ga.srl16(7) : ga;
+
+			if(m_sel.iip)
+			{
+				vpsrlw(xmm6, 7);
+			}
+
+			break;
+		}
+
+		if(m_sel.aa1)
+		{
+			// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+			// FIXME: bios config screen cubes
+
+			if(!m_sel.abe)
+			{
+				// a = cov
+
+				if(m_sel.edge)
+				{
+					vmovdqa(xmm0, ptr[&m_env.temp.cov]);
+				}
+				else
+				{
+					vpcmpeqd(xmm0, xmm0);
+					vpsllw(xmm0, 15);
+					vpsrlw(xmm0, 8);
+				}
+
+				mix16(xmm6, xmm0, xmm1);
 			}
 			else
 			{
+				// a = a == 0x80 ? cov : a
+
+				vpcmpeqd(xmm0, xmm0);
+				vpsllw(xmm0, 15);
+				vpsrlw(xmm0, 8);
+
+				if(m_sel.edge)
+				{
+					vmovdqa(xmm1, ptr[&m_env.temp.cov]);
+				}
+				else
+				{
+					vmovdqa(xmm1, xmm0);
+				}
+
+				vpcmpeqw(xmm0, xmm6);
+				vpsrld(xmm0, 16);
+				vpslld(xmm0, 16);
+
+				vpblendvb(xmm6, xmm1, xmm0);
+			}
+		}
+	}
+	else
+	{
+		switch(m_sel.tfx)
+		{
+		case TFX_MODULATE:
+
+			// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+			movdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+
+			// gat = gat.modulate16<1>(ga).clamp8();
+
+			modulate16<1>(xmm6, xmm4);
+
+			clamp16(xmm6, xmm3);
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if(!m_sel.tcc)
+			{
+				psrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_DECAL:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if(!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				movdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+
+				psrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_HIGHLIGHT:
+
+			// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+			movdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			movdqa(xmm2, xmm4);
+
+			// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+			psrlw(xmm4, 7);
+
+			if(m_sel.tcc)
+			{
+				paddusb(xmm4, xmm6);
+			}
+
+			mix16(xmm6, xmm4, xmm3);
+
+			break;
+
+		case TFX_HIGHLIGHT2:
+
+			// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+			if(!m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				movdqa(xmm4, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+				movdqa(xmm2, xmm4);
+
+				psrlw(xmm4, 7);
+
+				mix16(xmm6, xmm4, xmm3);
+			}
+
+			break;
+
+		case TFX_NONE:
+
+			// gat = iip ? ga.srl16(7) : ga;
+
+			if(m_sel.iip)
+			{
+				psrlw(xmm6, 7);
+			}
+
+			break;
+		}
+
+		if(m_sel.aa1)
+		{
+			// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+			// FIXME: bios config screen cubes
+
+			if(!m_sel.abe)
+			{
+				// a = cov
+
+				if(m_sel.edge)
+				{
+					movdqa(xmm0, ptr[&m_env.temp.cov]);
+				}
+				else
+				{
+					pcmpeqd(xmm0, xmm0);
+					psllw(xmm0, 15);
+					psrlw(xmm0, 8);
+				}
+
+				mix16(xmm6, xmm0, xmm1);
+			}
+			else
+			{
+				// a = a == 0x80 ? cov : a
+
 				pcmpeqd(xmm0, xmm0);
 				psllw(xmm0, 15);
 				psrlw(xmm0, 8);
+
+				if(m_sel.edge)
+				{
+					movdqa(xmm1, ptr[&m_env.temp.cov]);
+				}
+				else
+				{
+					movdqa(xmm1, xmm0);
+				}
+
+				pcmpeqw(xmm0, xmm6);
+				psrld(xmm0, 16);
+				pslld(xmm0, 16);
+
+				blend8(xmm6, xmm1);
 			}
-
-			mix16(xmm6, xmm0, xmm1);
-		}
-		else
-		{
-			// a = a == 0x80 ? cov : a
-
-			pcmpeqd(xmm0, xmm0);
-			psllw(xmm0, 15);
-			psrlw(xmm0, 8);
-
-			if(m_sel.edge)
-			{
-				movdqa(xmm1, xmmword[&m_env.temp.cov]);
-			}
-			else
-			{
-				movdqa(xmm1, xmm0);
-			}
-
-			pcmpeqw(xmm0, xmm6);
-			psrld(xmm0, 16);
-			pslld(xmm0, 16);
-
-			blend8(xmm6, xmm1);
 		}
 	}
 }
@@ -1332,76 +2362,149 @@ void GSDrawScanlineCodeGenerator::TestAlpha()
 		break;
 	}
 
-	switch(m_sel.atst)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-	case ATST_NEVER:
-		// t = GSVector4i::xffffffff();
-		pcmpeqd(xmm1, xmm1);
-		break;
+		switch(m_sel.atst)
+		{
+		case ATST_NEVER:
+			// t = GSVector4i::xffffffff();
+			vpcmpeqd(xmm1, xmm1);
+			break;
 
-	case ATST_ALWAYS:
-		return;
+		case ATST_ALWAYS:
+			return;
 
-	case ATST_LESS:
-	case ATST_LEQUAL:
-		// t = (ga >> 16) > m_env.aref;
-		movdqa(xmm1, xmm6);
-		psrld(xmm1, 16);
-		pcmpgtd(xmm1, xmmword[&m_env.aref]);
-		break;
+		case ATST_LESS:
+		case ATST_LEQUAL:
+			// t = (ga >> 16) > m_env.aref;
+			vpsrld(xmm1, xmm6, 16);
+			vpcmpgtd(xmm1, ptr[&m_env.aref]);
+			break;
 
-	case ATST_EQUAL:
-		// t = (ga >> 16) != m_env.aref;
-		movdqa(xmm1, xmm6);
-		psrld(xmm1, 16);
-		pcmpeqd(xmm1, xmmword[&m_env.aref]);
-		pcmpeqd(xmm0, xmm0);
-		pxor(xmm1, xmm0);
-		break;
+		case ATST_EQUAL:
+			// t = (ga >> 16) != m_env.aref;
+			vpsrld(xmm1, xmm6, 16);
+			vpcmpeqd(xmm1, ptr[&m_env.aref]);
+			vpcmpeqd(xmm0, xmm0);
+			vpxor(xmm1, xmm0);
+			break;
 
-	case ATST_GEQUAL:
-	case ATST_GREATER:
-		// t = (ga >> 16) < m_env.aref;
-		movdqa(xmm0, xmm6);
-		psrld(xmm0, 16);
-		movdqa(xmm1, xmmword[&m_env.aref]);
-		pcmpgtd(xmm1, xmm0);
-		break;
+		case ATST_GEQUAL:
+		case ATST_GREATER:
+			// t = (ga >> 16) < m_env.aref;
+			vpsrld(xmm0, xmm6, 16);
+			vmovdqa(xmm1, ptr[&m_env.aref]);
+			vpcmpgtd(xmm1, xmm0);
+			break;
 
-	case ATST_NOTEQUAL:
-		// t = (ga >> 16) == m_env.aref;
-		movdqa(xmm1, xmm6);
-		psrld(xmm1, 16);
-		pcmpeqd(xmm1, xmmword[&m_env.aref]);
-		break;
+		case ATST_NOTEQUAL:
+			// t = (ga >> 16) == m_env.aref;
+			vpsrld(xmm1, xmm6, 16);
+			vpcmpeqd(xmm1, ptr[&m_env.aref]);
+			break;
+		}
+
+		switch(m_sel.afail)
+		{
+		case AFAIL_KEEP:
+			// test |= t;
+			vpor(xmm7, xmm1);
+			alltrue();
+			break;
+
+		case AFAIL_FB_ONLY:
+			// zm |= t;
+			vpor(xmm4, xmm1);
+			break;
+
+		case AFAIL_ZB_ONLY:
+			// fm |= t;
+			vpor(xmm3, xmm1);
+			break;
+
+		case AFAIL_RGB_ONLY:
+			// zm |= t;
+			vpor(xmm4, xmm1);
+			// fm |= t & GSVector4i::xff000000();
+			vpsrld(xmm1, 24);
+			vpslld(xmm1, 24);
+			vpor(xmm3, xmm1);
+			break;
+		}
 	}
-
-	switch(m_sel.afail)
+	else
 	{
-	case AFAIL_KEEP:
-		// test |= t;
-		por(xmm7, xmm1);
-		alltrue();
-		break;
+		switch(m_sel.atst)
+		{
+		case ATST_NEVER:
+			// t = GSVector4i::xffffffff();
+			pcmpeqd(xmm1, xmm1);
+			break;
 
-	case AFAIL_FB_ONLY:
-		// zm |= t;
-		por(xmm4, xmm1);
-		break;
+		case ATST_ALWAYS:
+			return;
 
-	case AFAIL_ZB_ONLY:
-		// fm |= t;
-		por(xmm3, xmm1);
-		break;
+		case ATST_LESS:
+		case ATST_LEQUAL:
+			// t = (ga >> 16) > m_env.aref;
+			movdqa(xmm1, xmm6);
+			psrld(xmm1, 16);
+			pcmpgtd(xmm1, ptr[&m_env.aref]);
+			break;
 
-	case AFAIL_RGB_ONLY:
-		// zm |= t;
-		por(xmm4, xmm1);
-		// fm |= t & GSVector4i::xff000000();
-		psrld(xmm1, 24);
-		pslld(xmm1, 24);
-		por(xmm3, xmm1);
-		break;
+		case ATST_EQUAL:
+			// t = (ga >> 16) != m_env.aref;
+			movdqa(xmm1, xmm6);
+			psrld(xmm1, 16);
+			pcmpeqd(xmm1, ptr[&m_env.aref]);
+			pcmpeqd(xmm0, xmm0);
+			pxor(xmm1, xmm0);
+			break;
+
+		case ATST_GEQUAL:
+		case ATST_GREATER:
+			// t = (ga >> 16) < m_env.aref;
+			movdqa(xmm0, xmm6);
+			psrld(xmm0, 16);
+			movdqa(xmm1, ptr[&m_env.aref]);
+			pcmpgtd(xmm1, xmm0);
+			break;
+
+		case ATST_NOTEQUAL:
+			// t = (ga >> 16) == m_env.aref;
+			movdqa(xmm1, xmm6);
+			psrld(xmm1, 16);
+			pcmpeqd(xmm1, ptr[&m_env.aref]);
+			break;
+		}
+
+		switch(m_sel.afail)
+		{
+		case AFAIL_KEEP:
+			// test |= t;
+			por(xmm7, xmm1);
+			alltrue();
+			break;
+
+		case AFAIL_FB_ONLY:
+			// zm |= t;
+			por(xmm4, xmm1);
+			break;
+
+		case AFAIL_ZB_ONLY:
+			// fm |= t;
+			por(xmm3, xmm1);
+			break;
+
+		case AFAIL_RGB_ONLY:
+			// zm |= t;
+			por(xmm4, xmm1);
+			// fm |= t & GSVector4i::xff000000();
+			psrld(xmm1, 24);
+			pslld(xmm1, 24);
+			por(xmm3, xmm1);
+			break;
+		}
 	}
 }
 
@@ -1412,72 +2515,145 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
 		return;
 	}
 
-	switch(m_sel.tfx)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-	case TFX_MODULATE:
-
-		// GSVector4i rb = iip ? rbf : m_env.c.rb;
-
-		// rbt = rbt.modulate16<1>(rb).clamp8();
-
-		modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
-
-		clamp16(xmm5, xmm1);
-
-		break;
-
-	case TFX_DECAL:
-
-		break;
-
-	case TFX_HIGHLIGHT:
-	case TFX_HIGHLIGHT2:
-
-		if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+		switch(m_sel.tfx)
 		{
-			// GSVector4i ga = iip ? gaf : m_env.c.ga;
+		case TFX_MODULATE:
 
-			movdqa(xmm2, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			// GSVector4i rb = iip ? rbf : m_env.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).clamp8();
+
+			modulate16<1>(xmm5, ptr[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
+
+			clamp16(xmm5, xmm1);
+
+			break;
+
+		case TFX_DECAL:
+
+			break;
+
+		case TFX_HIGHLIGHT:
+		case TFX_HIGHLIGHT2:
+
+			if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				vmovdqa(xmm2, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			}
+
+			// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+
+			vmovdqa(xmm1, xmm6);
+
+			modulate16<1>(xmm6, xmm2);
+
+			vpshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+			vpshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+			vpsrlw(xmm2, 7);
+
+			vpaddw(xmm6, xmm2);
+
+			clamp16(xmm6, xmm0);
+
+			mix16(xmm6, xmm1, xmm0);
+
+			// GSVector4i rb = iip ? rbf : m_env.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+			modulate16<1>(xmm5, ptr[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
+
+			vpaddw(xmm5, xmm2);
+
+			clamp16(xmm5, xmm0);
+
+			break;
+
+		case TFX_NONE:
+
+			// rbt = iip ? rb.srl16(7) : rb;
+
+			if(m_sel.iip)
+			{
+				vpsrlw(xmm5, 7);
+			}
+
+			break;
 		}
-
-		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
-
-		movdqa(xmm1, xmm6);
-
-		modulate16<1>(xmm6, xmm2);
-
-		pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
-		pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
-		psrlw(xmm2, 7);
-
-		paddw(xmm6, xmm2);
-
-		clamp16(xmm6, xmm0);
-
-		mix16(xmm6, xmm1, xmm0);
-
-		// GSVector4i rb = iip ? rbf : m_env.c.rb;
-
-		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
-
-		modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
-
-		paddw(xmm5, xmm2);
-
-		clamp16(xmm5, xmm0);
-
-		break;
-
-	case TFX_NONE:
-
-		// rbt = iip ? rb.srl16(7) : rb;
-
-		if(m_sel.iip)
+	}
+	else
+	{
+		switch(m_sel.tfx)
 		{
-			psrlw(xmm5, 7);
-		}
+		case TFX_MODULATE:
 
-		break;
+			// GSVector4i rb = iip ? rbf : m_env.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).clamp8();
+
+			modulate16<1>(xmm5, ptr[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
+
+			clamp16(xmm5, xmm1);
+
+			break;
+
+		case TFX_DECAL:
+
+			break;
+
+		case TFX_HIGHLIGHT:
+		case TFX_HIGHLIGHT2:
+
+			if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+			{
+				// GSVector4i ga = iip ? gaf : m_env.c.ga;
+
+				movdqa(xmm2, ptr[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
+			}
+
+			// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+
+			movdqa(xmm1, xmm6);
+
+			modulate16<1>(xmm6, xmm2);
+
+			pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+			pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+			psrlw(xmm2, 7);
+
+			paddw(xmm6, xmm2);
+
+			clamp16(xmm6, xmm0);
+
+			mix16(xmm6, xmm1, xmm0);
+
+			// GSVector4i rb = iip ? rbf : m_env.c.rb;
+
+			// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+			modulate16<1>(xmm5, ptr[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);
+
+			paddw(xmm5, xmm2);
+
+			clamp16(xmm5, xmm0);
+
+			break;
+
+		case TFX_NONE:
+
+			// rbt = iip ? rb.srl16(7) : rb;
+
+			if(m_sel.iip)
+			{
+				psrlw(xmm5, 7);
+			}
+
+			break;
+		}
 	}
 }
 
@@ -1488,19 +2664,36 @@ void GSDrawScanlineCodeGenerator::Fog()
 		return;
 	}
 
-	// rb = m_env.frb.lerp16<0>(rb, f);
-	// ga = m_env.fga.lerp16<0>(ga, f).mix16(ga);
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		// rb = m_env.frb.lerp16<0>(rb, f);
+		// ga = m_env.fga.lerp16<0>(ga, f).mix16(ga);
 
-	movdqa(xmm0, xmmword[!m_sel.sprite ? &m_env.temp.f : &m_env.p.f]);
-	movdqa(xmm1, xmm6);
+		vmovdqa(xmm0, ptr[!m_sel.sprite ? &m_env.temp.f : &m_env.p.f]);
+		vmovdqa(xmm1, xmm6);
 
-	movdqa(xmm2, xmmword[&m_env.frb]);
-	lerp16<0>(xmm5, xmm2, xmm0);
+		vmovdqa(xmm2, ptr[&m_env.frb]);
+		lerp16<0>(xmm5, xmm2, xmm0);
 
-	movdqa(xmm2, xmmword[&m_env.fga]);
-	lerp16<0>(xmm6, xmm2, xmm0);
+		vmovdqa(xmm2, ptr[&m_env.fga]);
+		lerp16<0>(xmm6, xmm2, xmm0);
+		mix16(xmm6, xmm1, xmm0);
+	}
+	else
+	{
+		// rb = m_env.frb.lerp16<0>(rb, f);
+		// ga = m_env.fga.lerp16<0>(ga, f).mix16(ga);
 
-	mix16(xmm6, xmm1, xmm0);
+		movdqa(xmm0, ptr[!m_sel.sprite ? &m_env.temp.f : &m_env.p.f]);
+		movdqa(xmm1, xmm6);
+
+		movdqa(xmm2, ptr[&m_env.frb]);
+		lerp16<0>(xmm5, xmm2, xmm0);
+
+		movdqa(xmm2, ptr[&m_env.fga]);
+		lerp16<0>(xmm6, xmm2, xmm0);
+		mix16(xmm6, xmm1, xmm0);
+	}
 }
 
 void GSDrawScanlineCodeGenerator::ReadFrame()
@@ -1530,37 +2723,74 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
 		return;
 	}
 
-	// test |= ((fd [<< 16]) ^ m_env.datm).sra32(31);
-
-	movdqa(xmm1, xmm2);
-
-	if(m_sel.datm)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		if(m_sel.fpsm == 2)
+		// test |= ((fd [<< 16]) ^ m_env.datm).sra32(31);
+
+		if(m_sel.datm)
 		{
-			pxor(xmm0, xmm0);
-			psrld(xmm1, 15);
-			pcmpeqd(xmm1, xmm0);
+			if(m_sel.fpsm == 2)
+			{
+				vpxor(xmm0, xmm0);
+				vpsrld(xmm1, xmm2, 15);
+				vpcmpeqd(xmm1, xmm0);
+			}
+			else
+			{
+				vpcmpeqd(xmm0, xmm0);
+				vpxor(xmm1, xmm2, xmm0);
+				vpsrad(xmm1, 31);
+			}
 		}
 		else
 		{
-			pcmpeqd(xmm0, xmm0);
-			pxor(xmm1, xmm0);
-			psrad(xmm1, 31);
+			if(m_sel.fpsm == 2)
+			{
+				vpslld(xmm1, xmm2, 16);
+				vpsrad(xmm1, 31);
+			}
+			else
+			{
+				vpsrad(xmm1, xmm2, 31);
+			}
 		}
+
+		vpor(xmm7, xmm1);
 	}
 	else
 	{
-		if(m_sel.fpsm == 2)
+		// test |= ((fd [<< 16]) ^ m_env.datm).sra32(31);
+
+		movdqa(xmm1, xmm2);
+
+		if(m_sel.datm)
 		{
-			pslld(xmm1, 16);
+			if(m_sel.fpsm == 2)
+			{
+				pxor(xmm0, xmm0);
+				psrld(xmm1, 15);
+				pcmpeqd(xmm1, xmm0);
+			}
+			else
+			{
+				pcmpeqd(xmm0, xmm0);
+				pxor(xmm1, xmm0);
+				psrad(xmm1, 31);
+			}
+		}
+		else
+		{
+			if(m_sel.fpsm == 2)
+			{
+				pslld(xmm1, 16);
+			}
+
+			psrad(xmm1, 31);
 		}
 
-		psrad(xmm1, 31);
+		por(xmm7, xmm1);
 	}
 
-	por(xmm7, xmm1);
-
 	alltrue();
 }
 
@@ -1571,19 +2801,31 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
 		return;
 	}
 
-	movdqa(xmm1, xmmword[!m_sel.sprite ? &m_env.temp.zs : &m_env.p.z]);
+	bool fast = m_sel.ztest && m_sel.zpsm < 2;
 
-	bool fast = false;
-
-	if(m_sel.ztest && m_sel.zpsm < 2)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// zs = zs.blend8(zd, zm);
+		vmovdqa(xmm1, ptr[!m_sel.sprite ? &m_env.temp.zs : &m_env.p.z]);
 
-		movdqa(xmm0, xmm4);
-		movdqa(xmm7, xmmword[&m_env.temp.zd]);
-		blend8(xmm1, xmm7);
+		if(fast)
+		{
+			// zs = zs.blend8(zd, zm);
 
-		fast = true;
+			vpblendvb(xmm1, ptr[&m_env.temp.zd], xmm4);
+		}
+	}
+	else
+	{
+		movdqa(xmm1, ptr[!m_sel.sprite ? &m_env.temp.zs : &m_env.p.z]);
+
+		if(fast)
+		{
+			// zs = zs.blend8(zd, zm);
+
+			movdqa(xmm0, xmm4);
+			movdqa(xmm7, ptr[&m_env.temp.zd]);
+			blend8(xmm1, xmm7);
+		}
 	}
 
 	WritePixel(xmm1, xmm0, ebp, dh, fast, m_sel.zpsm);
@@ -1601,233 +2843,449 @@ void GSDrawScanlineCodeGenerator::AlphaBlend()
 		return;
 	}
 
-	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		switch(m_sel.fpsm)
+		if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
 		{
-		case 0:
-		case 1:
-
-			// c[2] = fd & mask;
-			// c[3] = (fd >> 8) & mask;
-
-			movdqa(xmm0, xmm2);
-			movdqa(xmm1, xmm2);
-
-			psllw(xmm0, 8);
-			psrlw(xmm0, 8);
-			psrlw(xmm1, 8);
-
-			break;
-
-		case 2:
-
-			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
-			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
-
-			movdqa(xmm0, xmm2);
-			movdqa(xmm1, xmm2);
-			movdqa(xmm4, xmm2);
-
-			pcmpeqd(xmm7, xmm7);
-			psrld(xmm7, 27); // 0x0000001f
-			pand(xmm0, xmm7);
-			pslld(xmm0, 3);
-
-			pslld(xmm7, 10); // 0x00007c00
-			pand(xmm4, xmm7);
-			pslld(xmm4, 9);
-
-			por(xmm0, xmm4);
-
-			movdqa(xmm4, xmm1);
-
-			psrld(xmm7, 5); // 0x000003e0
-			pand(xmm1, xmm7);
-			psrld(xmm1, 2);
-
-			psllw(xmm7, 10); // 0x00008000
-			pand(xmm4, xmm7);
-			pslld(xmm4, 8);
-
-			por(xmm1, xmm4);
-
-			break;
-		}
-	}
-
-	// xmm5, xmm6 = src rb, ga
-	// xmm0, xmm1 = dst rb, ga
-	// xmm2, xmm3 = used
-	// xmm4, xmm7 = free
-
-	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
-	{
-		movdqa(xmm4, xmm5);
-	}
-
-	if(m_sel.aba != m_sel.abb)
-	{
-		// rb = c[aba * 2 + 0];
-
-		switch(m_sel.aba)
-		{
-		case 0: break;
-		case 1: movdqa(xmm5, xmm0); break;
-		case 2: pxor(xmm5, xmm5); break;
-		}
-
-		// rb = rb.sub16(c[abb * 2 + 0]);
-
-		switch(m_sel.abb)
-		{
-		case 0: psubw(xmm5, xmm4); break;
-		case 1: psubw(xmm5, xmm0); break;
-		case 2: break;
-		}
-
-		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
-		{
-			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;
-
-			switch(m_sel.abc)
+			switch(m_sel.fpsm)
 			{
 			case 0:
 			case 1:
-				movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
-				pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
-				pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
-				psllw(xmm7, 7);
+
+				// c[2] = fd & mask;
+				// c[3] = (fd >> 8) & mask;
+
+				vpsllw(xmm0, xmm2, 8);
+				vpsrlw(xmm0, 8);
+				vpsrlw(xmm1, xmm2, 8);
+
 				break;
+
 			case 2:
-				movdqa(xmm7, xmmword[&m_env.afix]);
+
+				// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+				// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+				vpcmpeqd(xmm7, xmm7);
+
+				vpsrld(xmm7, 27); // 0x0000001f
+				vpand(xmm0, xmm2, xmm7);
+				vpslld(xmm0, 3);
+
+				vpslld(xmm7, 10); // 0x00007c00
+				vpand(xmm4, xmm2, xmm7);
+				vpslld(xmm4, 9);
+
+				vpor(xmm0, xmm4);
+
+				vpsrld(xmm7, 5); // 0x000003e0
+				vpand(xmm1, xmm2, xmm7);
+				vpsrld(xmm1, 2);
+
+				vpsllw(xmm7, 10); // 0x00008000
+				vpand(xmm4, xmm2, xmm7);
+				vpslld(xmm4, 8);
+
+				vpor(xmm1, xmm4);
+
 				break;
 			}
-
-			// rb = rb.modulate16<1>(a);
-
-			modulate16<1>(xmm5, xmm7);
 		}
 
-		// rb = rb.add16(c[abd * 2 + 0]);
+		// xmm5, xmm6 = src rb, ga
+		// xmm0, xmm1 = dst rb, ga
+		// xmm2, xmm3 = used
+		// xmm4, xmm7 = free
 
-		switch(m_sel.abd)
+		if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
 		{
-		case 0: paddw(xmm5, xmm4); break;
-		case 1: paddw(xmm5, xmm0); break;
-		case 2: break;
+			vmovdqa(xmm4, xmm5);
+		}
+
+		if(m_sel.aba != m_sel.abb)
+		{
+			// rb = c[aba * 2 + 0];
+
+			switch(m_sel.aba)
+			{
+			case 0: break;
+			case 1: vmovdqa(xmm5, xmm0); break;
+			case 2: vpxor(xmm5, xmm5); break;
+			}
+
+			// rb = rb.sub16(c[abb * 2 + 0]);
+
+			switch(m_sel.abb)
+			{
+			case 0: vpsubw(xmm5, xmm4); break;
+			case 1: vpsubw(xmm5, xmm0); break;
+			case 2: break;
+			}
+
+			if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+			{
+				// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;
+
+				switch(m_sel.abc)
+				{
+				case 0:
+				case 1:
+					vpshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1));
+					vpshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
+					vpsllw(xmm7, 7);
+					break;
+				case 2:
+					vmovdqa(xmm7, ptr[&m_env.afix]);
+					break;
+				}
+
+				// rb = rb.modulate16<1>(a);
+
+				modulate16<1>(xmm5, xmm7);
+			}
+
+			// rb = rb.add16(c[abd * 2 + 0]);
+
+			switch(m_sel.abd)
+			{
+			case 0: vpaddw(xmm5, xmm4); break;
+			case 1: vpaddw(xmm5, xmm0); break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			// rb = c[abd * 2 + 0];
+
+			switch(m_sel.abd)
+			{
+			case 0: break;
+			case 1: vmovdqa(xmm5, xmm0); break;
+			case 2: vpxor(xmm5, xmm5); break;
+			}
+		}
+
+		if(m_sel.pabe)
+		{
+			// mask = (c[1] << 8).sra32(31);
+
+			vpslld(xmm0, xmm6, 8);
+			vpsrad(xmm0, 31);
+
+			// rb = c[0].blend8(rb, mask);
+
+			vpblendvb(xmm5, xmm4, xmm5, xmm0);
+		}
+
+		// xmm6 = src ga
+		// xmm1 = dst ga
+		// xmm5 = rb
+		// xmm7 = a
+		// xmm2, xmm3 = used
+		// xmm0, xmm4 = free
+
+		vmovdqa(xmm4, xmm6);
+
+		if(m_sel.aba != m_sel.abb)
+		{
+			// ga = c[aba * 2 + 1];
+
+			switch(m_sel.aba)
+			{
+			case 0: break;
+			case 1: vmovdqa(xmm6, xmm1); break;
+			case 2: vpxor(xmm6, xmm6); break;
+			}
+
+			// ga = ga.sub16(c[abb * 2 + 1]);
+
+			switch(m_sel.abb)
+			{
+			case 0: vpsubw(xmm6, xmm4); break;
+			case 1: vpsubw(xmm6, xmm1); break;
+			case 2: break;
+			}
+
+			if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+			{
+				// ga = ga.modulate16<1>(a);
+
+				modulate16<1>(xmm6, xmm7);
+			}
+
+			// ga = ga.add16(c[abd * 2 + 1]);
+
+			switch(m_sel.abd)
+			{
+			case 0: vpaddw(xmm6, xmm4); break;
+			case 1: vpaddw(xmm6, xmm1); break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			// ga = c[abd * 2 + 1];
+
+			switch(m_sel.abd)
+			{
+			case 0: break;
+			case 1: vmovdqa(xmm6, xmm1); break;
+			case 2: vpxor(xmm6, xmm6); break;
+			}
+		}
+
+		// xmm4 = src ga
+		// xmm5 = rb
+		// xmm6 = ga
+		// xmm2, xmm3 = used
+		// xmm0, xmm1, xmm7 = free
+
+		if(m_sel.pabe)
+		{
+			vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+			// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+			vpblendvb(xmm6, xmm4, xmm6, xmm0);
+		}
+		else
+		{
+			if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+			{
+				mix16(xmm6, xmm4, xmm7);
+			}
 		}
 	}
 	else
 	{
-		// rb = c[abd * 2 + 0];
-
-		switch(m_sel.abd)
+		if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
 		{
-		case 0: break;
-		case 1: movdqa(xmm5, xmm0); break;
-		case 2: pxor(xmm5, xmm5); break;
-		}
-	}
+			switch(m_sel.fpsm)
+			{
+			case 0:
+			case 1:
 
-	if(m_sel.pabe)
-	{
-		// mask = (c[1] << 8).sra32(31);
+				// c[2] = fd & mask;
+				// c[3] = (fd >> 8) & mask;
 
-		movdqa(xmm0, xmm6);
-		pslld(xmm0, 8);
-		psrad(xmm0, 31);
+				movdqa(xmm0, xmm2);
+				movdqa(xmm1, xmm2);
 
-		// rb = c[0].blend8(rb, mask);
+				psllw(xmm0, 8);
+				psrlw(xmm0, 8);
+				psrlw(xmm1, 8);
 
-		blend8r(xmm5, xmm4);
-	}
+				break;
 
-	// xmm6 = src ga
-	// xmm1 = dst ga
-	// xmm5 = rb
-	// xmm7 = a
-	// xmm2, xmm3 = used
-	// xmm0, xmm4 = free
+			case 2:
 
-	movdqa(xmm4, xmm6);
+				// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+				// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
 
-	if(m_sel.aba != m_sel.abb)
-	{
-		// ga = c[aba * 2 + 1];
+				movdqa(xmm0, xmm2);
+				movdqa(xmm1, xmm2);
+				movdqa(xmm4, xmm2);
 
-		switch(m_sel.aba)
-		{
-		case 0: break;
-		case 1: movdqa(xmm6, xmm1); break;
-		case 2: pxor(xmm6, xmm6); break;
+				pcmpeqd(xmm7, xmm7);
+				psrld(xmm7, 27); // 0x0000001f
+				pand(xmm0, xmm7);
+				pslld(xmm0, 3);
+
+				pslld(xmm7, 10); // 0x00007c00
+				pand(xmm4, xmm7);
+				pslld(xmm4, 9);
+
+				por(xmm0, xmm4);
+
+				movdqa(xmm4, xmm1);
+
+				psrld(xmm7, 5); // 0x000003e0
+				pand(xmm1, xmm7);
+				psrld(xmm1, 2);
+
+				psllw(xmm7, 10); // 0x00008000
+				pand(xmm4, xmm7);
+				pslld(xmm4, 8);
+
+				por(xmm1, xmm4);
+
+				break;
+			}
 		}
 
-		// ga = ga.sub16(c[abeb * 2 + 1]);
+		// xmm5, xmm6 = src rb, ga
+		// xmm0, xmm1 = dst rb, ga
+		// xmm2, xmm3 = used
+		// xmm4, xmm7 = free
 
-		switch(m_sel.abb)
+		if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
 		{
-		case 0: psubw(xmm6, xmm4); break;
-		case 1: psubw(xmm6, xmm1); break;
-		case 2: break;
+			movdqa(xmm4, xmm5);
 		}
 
-		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		if(m_sel.aba != m_sel.abb)
 		{
-			// ga = ga.modulate16<1>(a);
+			// rb = c[aba * 2 + 0];
 
-			modulate16<1>(xmm6, xmm7);
+			switch(m_sel.aba)
+			{
+			case 0: break;
+			case 1: movdqa(xmm5, xmm0); break;
+			case 2: pxor(xmm5, xmm5); break;
+			}
+
+			// rb = rb.sub16(c[abb * 2 + 0]);
+
+			switch(m_sel.abb)
+			{
+			case 0: psubw(xmm5, xmm4); break;
+			case 1: psubw(xmm5, xmm0); break;
+			case 2: break;
+			}
+
+			if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+			{
+				// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;
+
+				switch(m_sel.abc)
+				{
+				case 0:
+				case 1:
+					movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
+					pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
+					pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
+					psllw(xmm7, 7);
+					break;
+				case 2:
+					movdqa(xmm7, ptr[&m_env.afix]);
+					break;
+				}
+
+				// rb = rb.modulate16<1>(a);
+
+				modulate16<1>(xmm5, xmm7);
+			}
+
+			// rb = rb.add16(c[abd * 2 + 0]);
+
+			switch(m_sel.abd)
+			{
+			case 0: paddw(xmm5, xmm4); break;
+			case 1: paddw(xmm5, xmm0); break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			// rb = c[abd * 2 + 0];
+
+			switch(m_sel.abd)
+			{
+			case 0: break;
+			case 1: movdqa(xmm5, xmm0); break;
+			case 2: pxor(xmm5, xmm5); break;
+			}
 		}
 
-		// ga = ga.add16(c[abd * 2 + 1]);
-
-		switch(m_sel.abd)
+		if(m_sel.pabe)
 		{
-		case 0: paddw(xmm6, xmm4); break;
-		case 1: paddw(xmm6, xmm1); break;
-		case 2: break;
-		}
-	}
-	else
-	{
-		// ga = c[abd * 2 + 1];
+			// mask = (c[1] << 8).sra32(31);
 
-		switch(m_sel.abd)
-		{
-		case 0: break;
-		case 1: movdqa(xmm6, xmm1); break;
-		case 2: pxor(xmm6, xmm6); break;
-		}
-	}
-
-	// xmm4 = src ga
-	// xmm5 = rb
-	// xmm6 = ga
-	// xmm2, xmm3 = used
-	// xmm0, xmm1, xmm7 = free
-
-	if(m_sel.pabe)
-	{
-		if(!m_cpu.has(util::Cpu::tSSE41))
-		{
-			// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
-
-			movdqa(xmm0, xmm4);
+			movdqa(xmm0, xmm6);
 			pslld(xmm0, 8);
 			psrad(xmm0, 31);
+
+			// rb = c[0].blend8(rb, mask);
+
+			blend8r(xmm5, xmm4);
 		}
 
-		psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+		// xmm6 = src ga
+		// xmm1 = dst ga
+		// xmm5 = rb
+		// xmm7 = a
+		// xmm2, xmm3 = used
+		// xmm0, xmm4 = free
 
-		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+		movdqa(xmm4, xmm6);
 
-		blend8r(xmm6, xmm4);
-	}
-	else
-	{
-		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		if(m_sel.aba != m_sel.abb)
 		{
-			mix16(xmm6, xmm4, xmm7);
+			// ga = c[aba * 2 + 1];
+
+			switch(m_sel.aba)
+			{
+			case 0: break;
+			case 1: movdqa(xmm6, xmm1); break;
+			case 2: pxor(xmm6, xmm6); break;
+			}
+
+			// ga = ga.sub16(c[abb * 2 + 1]);
+
+			switch(m_sel.abb)
+			{
+			case 0: psubw(xmm6, xmm4); break;
+			case 1: psubw(xmm6, xmm1); break;
+			case 2: break;
+			}
+
+			if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+			{
+				// ga = ga.modulate16<1>(a);
+
+				modulate16<1>(xmm6, xmm7);
+			}
+
+			// ga = ga.add16(c[abd * 2 + 1]);
+
+			switch(m_sel.abd)
+			{
+			case 0: paddw(xmm6, xmm4); break;
+			case 1: paddw(xmm6, xmm1); break;
+			case 2: break;
+			}
+		}
+		else
+		{
+			// ga = c[abd * 2 + 1];
+
+			switch(m_sel.abd)
+			{
+			case 0: break;
+			case 1: movdqa(xmm6, xmm1); break;
+			case 2: pxor(xmm6, xmm6); break;
+			}
+		}
+
+		// xmm4 = src ga
+		// xmm5 = rb
+		// xmm6 = ga
+		// xmm2, xmm3 = used
+		// xmm0, xmm1, xmm7 = free
+
+		if(m_sel.pabe)
+		{
+			if(!m_cpu.has(util::Cpu::tSSE41))
+			{
+				// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
+
+				movdqa(xmm0, xmm4);
+				pslld(xmm0, 8);
+				psrad(xmm0, 31);
+			}
+
+			psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+			// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+			blend8r(xmm6, xmm4);
+		}
+		else
+		{
+			if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+			{
+				mix16(xmm6, xmm4, xmm7);
+			}
 		}
 	}
 }
@@ -1841,72 +3299,140 @@ void GSDrawScanlineCodeGenerator::WriteFrame(int params)
 		return;
 	}
 
-	if(m_sel.colclamp == 0)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// c[0] &= 0x000000ff;
-		// c[1] &= 0x000000ff;
+		if(m_sel.colclamp == 0)
+		{
+			// c[0] &= 0x000000ff;
+			// c[1] &= 0x000000ff;
 
-		pcmpeqd(xmm7, xmm7);
-		psrlw(xmm7, 8);
-		pand(xmm5, xmm7);
-		pand(xmm6, xmm7);
+			vpcmpeqd(xmm7, xmm7);
+			vpsrlw(xmm7, 8);
+			vpand(xmm5, xmm7);
+			vpand(xmm6, xmm7);
+		}
+
+		if(m_sel.fpsm == 2 && m_sel.dthe)
+		{
+			mov(eax, dword[esp + _top]);
+			and(eax, 3);
+			shl(eax, 5);
+			vpaddw(xmm5, ptr[eax + (size_t)&m_env.dimx[0]]);
+			vpaddw(xmm6, ptr[eax + (size_t)&m_env.dimx[1]]);
+		}
+
+		// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+
+		vpunpckhwd(xmm7, xmm5, xmm6);
+		vpunpcklwd(xmm5, xmm6);
+		vpackuswb(xmm5, xmm7);
+
+		if(m_sel.fba && m_sel.fpsm != 1)
+		{
+			// fs |= 0x80000000;
+
+			vpcmpeqd(xmm7, xmm7);
+			vpslld(xmm7, 31);
+			vpor(xmm5, xmm7);
+		}
+
+		if(m_sel.fpsm == 2)
+		{
+			// GSVector4i rb = fs & 0x00f800f8;
+			// GSVector4i ga = fs & 0x8000f800;
+
+			mov(eax, 0x00f800f8);
+			vmovd(xmm6, eax);
+			vpshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+
+			mov(eax, 0x8000f800);
+			vmovd(xmm7, eax);
+			vpshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+
+			vpand(xmm4, xmm5, xmm6);
+			vpand(xmm5, xmm7);
+
+			// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+			vpsrld(xmm6, xmm4, 9);
+			vpsrld(xmm4, 3);
+			vpsrld(xmm7, xmm5, 16);
+			vpsrld(xmm5, 6);
+
+			vpor(xmm5, xmm4);
+			vpor(xmm7, xmm6);
+			vpor(xmm5, xmm7);
+		}
 	}
-
-	if(m_sel.fpsm == 2 && m_sel.dthe)
+	else
 	{
-		mov(eax, dword[esp + _top]);
-		and(eax, 3);
-		shl(eax, 5);
-		paddw(xmm5, xmmword[eax + (size_t)&m_env.dimx[0]]);
-		paddw(xmm6, xmmword[eax + (size_t)&m_env.dimx[1]]);
-	}
+		if(m_sel.colclamp == 0)
+		{
+			// c[0] &= 0x000000ff;
+			// c[1] &= 0x000000ff;
 
-	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+			pcmpeqd(xmm7, xmm7);
+			psrlw(xmm7, 8);
+			pand(xmm5, xmm7);
+			pand(xmm6, xmm7);
+		}
 
-	movdqa(xmm7, xmm5);
-	punpcklwd(xmm5, xmm6);
-	punpckhwd(xmm7, xmm6);
-	packuswb(xmm5, xmm7);
+		if(m_sel.fpsm == 2 && m_sel.dthe)
+		{
+			mov(eax, dword[esp + _top]);
+			and(eax, 3);
+			shl(eax, 5);
+			paddw(xmm5, ptr[eax + (size_t)&m_env.dimx[0]]);
+			paddw(xmm6, ptr[eax + (size_t)&m_env.dimx[1]]);
+		}
 
-	if(m_sel.fba && m_sel.fpsm != 1)
-	{
-		// fs |= 0x80000000;
+		// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
 
-		pcmpeqd(xmm7, xmm7);
-		pslld(xmm7, 31);
-		por(xmm5, xmm7);
-	}
-
-	if(m_sel.fpsm == 2)
-	{
-		// GSVector4i rb = fs & 0x00f800f8;
-		// GSVector4i ga = fs & 0x8000f800;
-
-		mov(eax, 0x00f800f8);
-		movd(xmm6, eax);
-		pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
-
-		mov(eax, 0x8000f800);
-		movd(xmm7, eax);
-		pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
-
-		movdqa(xmm4, xmm5);
-		pand(xmm4, xmm6);
-		pand(xmm5, xmm7);
-
-		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
-
-		movdqa(xmm6, xmm4);
 		movdqa(xmm7, xmm5);
+		punpcklwd(xmm5, xmm6);
+		punpckhwd(xmm7, xmm6);
+		packuswb(xmm5, xmm7);
 
-		psrld(xmm4, 3);
-		psrld(xmm6, 9);
-		psrld(xmm5, 6);
-		psrld(xmm7, 16);
+		if(m_sel.fba && m_sel.fpsm != 1)
+		{
+			// fs |= 0x80000000;
 
-		por(xmm5, xmm4);
-		por(xmm7, xmm6);
-		por(xmm5, xmm7);
+			pcmpeqd(xmm7, xmm7);
+			pslld(xmm7, 31);
+			por(xmm5, xmm7);
+		}
+
+		if(m_sel.fpsm == 2)
+		{
+			// GSVector4i rb = fs & 0x00f800f8;
+			// GSVector4i ga = fs & 0x8000f800;
+
+			mov(eax, 0x00f800f8);
+			movd(xmm6, eax);
+			pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+
+			mov(eax, 0x8000f800);
+			movd(xmm7, eax);
+			pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+
+			movdqa(xmm4, xmm5);
+			pand(xmm4, xmm6);
+			pand(xmm5, xmm7);
+
+			// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+			movdqa(xmm6, xmm4);
+			movdqa(xmm7, xmm5);
+
+			psrld(xmm4, 3);
+			psrld(xmm6, 9);
+			psrld(xmm5, 6);
+			psrld(xmm7, 16);
+
+			por(xmm5, xmm4);
+			por(xmm7, xmm6);
+			por(xmm5, xmm7);
+		}
 	}
 
 	if(m_sel.rfb)
@@ -1923,8 +3449,16 @@ void GSDrawScanlineCodeGenerator::WriteFrame(int params)
 
 void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
 {
-	movq(dst, qword[addr * 2 + (size_t)m_env.vm]);
-	movhps(dst, qword[addr * 2 + (size_t)m_env.vm + 8 * 2]);
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		movq(dst, qword[addr * 2 + (size_t)m_env.vm]);
+		vmovhps(dst, qword[addr * 2 + (size_t)m_env.vm + 8 * 2]);
+	}
+	else
+	{
+		movq(dst, qword[addr * 2 + (size_t)m_env.vm]);
+		movhps(dst, qword[addr * 2 + (size_t)m_env.vm + 8 * 2]);
+	}
 }
 
 void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Xmm& temp, const Reg32& addr, const Reg8& mask, bool fast, int psm)
@@ -1934,15 +3468,32 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Xmm& temp, co
 		// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
 		// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
 
-		test(mask, 0x0f);
-		je("@f");
-		movq(qword[addr * 2 + (size_t)m_env.vm], src);
-		L("@@");
+		if(m_cpu.has(util::Cpu::tAVX))
+		{
+			test(mask, 0x0f);
+			je("@f");
+			movq(qword[addr * 2 + (size_t)m_env.vm], src);
+			L("@@");
 
-		test(mask, 0xf0);
-		je("@f");
-		movhps(qword[addr * 2 + (size_t)m_env.vm + 8 * 2], src);
-		L("@@");
+			test(mask, 0xf0);
+			je("@f");
+			vmovhps(qword[addr * 2 + (size_t)m_env.vm + 8 * 2], src);
+			L("@@");
+
+			// vmaskmovps?
+		}
+		else
+		{
+			test(mask, 0x0f);
+			je("@f");
+			movq(qword[addr * 2 + (size_t)m_env.vm], src);
+			L("@@");
+
+			test(mask, 0xf0);
+			je("@f");
+			movhps(qword[addr * 2 + (size_t)m_env.vm + 8 * 2], src);
+			L("@@");
+		}
 	}
 	else
 	{
@@ -1979,7 +3530,28 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Xmm& temp, co
 
 	Address dst = ptr[addr * 2 + (size_t)m_env.vm + offsets[i] * 2];
 
-	if(m_cpu.has(util::Cpu::tSSE41))
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		switch(psm)
+		{
+		case 0:
+			if(i == 0) vmovd(dst, src);
+			else vpextrd(dst, src, i);
+			break;
+		case 1:
+			if(i == 0) vmovd(eax, src);
+			else vpextrd(eax, src, i);
+			xor(eax, dst);
+			and(eax, 0xffffff);
+			xor(dst, eax);
+			break;
+		case 2:
+			pextrw(eax, src, i * 2); // vpextrw is broken in xbyak 2.99
+			mov(dst, ax);
+			break;
+		}
+	}
+	else if(m_cpu.has(util::Cpu::tSSE41))
 	{
 		switch(psm)
 		{
@@ -2052,20 +3624,33 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, con
 
 void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
 {
-	if(!m_cpu.has(util::Cpu::tSSE41) && i > 0)
-	{
-		ASSERT(0);
-	}
-
-	if(i == 0) movd(eax, addr);
-	else pextrd(eax, addr, i);
-
-	if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
-
 	const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_env.clut] : ptr[ebx + eax * 4];
 
-	if(i == 0) movd(dst, src);
-	else pinsrd(dst, src, i);
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		if(i == 0) vmovd(eax, addr);
+		else vpextrd(eax, addr, i);
+
+		if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
+
+		if(i == 0) vmovd(dst, src);
+		else vpinsrd(dst, src, i);
+	}
+	else
+	{
+		if(!m_cpu.has(util::Cpu::tSSE41) && i > 0)
+		{
+			ASSERT(0);
+		}
+
+		if(i == 0) movd(eax, addr);
+		else pextrd(eax, addr, i);
+
+		if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
+
+		if(i == 0) movd(dst, src);
+		else pinsrd(dst, src, i);
+	}
 }
 
 template<int shift>
@@ -2073,26 +3658,54 @@ void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f)
 {
 	if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
 	{
-		pmulhrsw(a, f);
+		if(m_cpu.has(util::Cpu::tAVX))
+		{
+			vpmulhrsw(a, f);
+		}
+		else
+		{
+			pmulhrsw(a, f);
+		}
 	}
 	else
 	{
-		psllw(a, shift + 1);
-		pmulhw(a, f);
+		if(m_cpu.has(util::Cpu::tAVX))
+		{
+			vpsllw(a, shift + 1);
+			vpmulhw(a, f);
+		}
+		else
+		{
+			psllw(a, shift + 1);
+			pmulhw(a, f);
+		}
 	}
 }
 
 template<int shift>
 void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f)
 {
-	psubw(a, b);
-	modulate16<shift>(a, f);
-	paddw(a, b);
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		vpsubw(a, b);
+		modulate16<shift>(a, f);
+		vpaddw(a, b);
+	}
+	else
+	{
+		psubw(a, b);
+		modulate16<shift>(a, f);
+		paddw(a, b);
+	}
 }
 
 void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
 {
-	if(m_cpu.has(util::Cpu::tSSE41))
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		vpblendw(a, b, 0xaa);
+	}
+	else if(m_cpu.has(util::Cpu::tSSE41))
 	{
 		pblendw(a, b, 0xaa);
 	}
@@ -2108,14 +3721,19 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& t
 
 void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
 {
-	packuswb(a, a);
-
-	if(m_cpu.has(util::Cpu::tSSE41))
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
+		vpackuswb(a, a);
+		vpmovzxbw(a, a);
+	}
+	else if(m_cpu.has(util::Cpu::tSSE41))
+	{
+		packuswb(a, a);
 		pmovzxbw(a, a);
 	}
 	else
 	{
+		packuswb(a, a);
 		pxor(temp, temp);
 		punpcklbw(a, temp);
 	}
@@ -2123,9 +3741,51 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
 
 void GSDrawScanlineCodeGenerator::alltrue()
 {
-	pmovmskb(eax, xmm7);
-	cmp(eax, 0xffff);
-	je("step", T_NEAR);
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		vpmovmskb(eax, xmm7);
+		cmp(eax, 0xffff);
+		je("step", T_NEAR);
+	}
+	else
+	{
+		pmovmskb(eax, xmm7);
+		cmp(eax, 0xffff);
+		je("step", T_NEAR);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
+{
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		vpand(b, mask);
+		vpandn(mask, a);
+		vpor(a, b, mask);
+	}
+	else
+	{
+		pand(b, mask);
+		pandn(mask, a);
+		por(b, mask);
+		movdqa(a, b);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
+{
+	if(m_cpu.has(util::Cpu::tAVX))
+	{
+		vpand(b, mask);
+		vpandn(mask, a);
+		vpor(b, mask);
+	}
+	else
+	{
+		pand(b, mask);
+		pandn(mask, a);
+		por(b, mask);
+	}
 }
 
 void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
@@ -2140,14 +3800,6 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
 	}
 }
 
-void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
-{
-	pand(b, mask);
-	pandn(mask, a);
-	por(b, mask);
-	movdqa(a, b);
-}
-
 void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
 {
 	if(m_cpu.has(util::Cpu::tSSE41))
@@ -2161,13 +3813,6 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
 	}
 }
 
-void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
-{
-	pand(b, mask);
-	pandn(mask, a);
-	por(b, mask);
-}
-
 const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
 {
 	GSVector4i::zero(),
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h
index 76358a8358..68b711adc1 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h
@@ -67,10 +67,10 @@ class GSDrawScanlineCodeGenerator : public CodeGenerator
 	void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
 	void clamp16(const Xmm& a, const Xmm& temp);
 	void alltrue();
-	void blend8(const Xmm& a, const Xmm& b);
 	void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
-	void blend8r(const Xmm& b, const Xmm& a);
 	void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
+	void blend8(const Xmm& a, const Xmm& b);
+	void blend8r(const Xmm& b, const Xmm& a);
 
 public:
 	GSDrawScanlineCodeGenerator(GSScanlineEnvironment& env, uint64 key, void* ptr, size_t maxsize);
diff --git a/plugins/GSdx/GSDrawingContext.h b/plugins/GSdx/GSDrawingContext.h
index 66efc2f3bb..2f75b2d5ec 100644
--- a/plugins/GSdx/GSDrawingContext.h
+++ b/plugins/GSdx/GSDrawingContext.h
@@ -26,7 +26,7 @@
 
 #pragma pack(push, 1)
 
-__aligned16 class GSDrawingContext
+__aligned32 class GSDrawingContext
 {
 public:
 	GIFRegXYOFFSET	XYOFFSET;
@@ -43,7 +43,7 @@ public:
 	GIFRegFRAME		FRAME;
 	GIFRegZBUF		ZBUF;
 
-	__aligned16 struct
+	__aligned32 struct
 	{
 		GSVector4i dx10;
 		GSVector4 dx9;
diff --git a/plugins/GSdx/GSDrawingEnvironment.h b/plugins/GSdx/GSDrawingEnvironment.h
index 982df9eace..4c36ba0c44 100644
--- a/plugins/GSdx/GSDrawingEnvironment.h
+++ b/plugins/GSdx/GSDrawingEnvironment.h
@@ -25,7 +25,7 @@
 
 #pragma pack(push, 1)
 
-__aligned16 class GSDrawingEnvironment
+__aligned32 class GSDrawingEnvironment
 {
 public:
 	GIFRegPRIM			PRIM;
diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp
index 6947c8d44e..b87130b02b 100644
--- a/plugins/GSdx/GSLocalMemory.cpp
+++ b/plugins/GSdx/GSLocalMemory.cpp
@@ -56,14 +56,14 @@ uint32 GSLocalMemory::pageOffset16SZ[32][64][64];
 uint32 GSLocalMemory::pageOffset8[32][64][128];
 uint32 GSLocalMemory::pageOffset4[32][128][128];
 
-int GSLocalMemory::rowOffset32[2048];
-int GSLocalMemory::rowOffset32Z[2048];
-int GSLocalMemory::rowOffset16[2048];
-int GSLocalMemory::rowOffset16S[2048];
-int GSLocalMemory::rowOffset16Z[2048];
-int GSLocalMemory::rowOffset16SZ[2048];
-int GSLocalMemory::rowOffset8[2][2048];
-int GSLocalMemory::rowOffset4[2][2048];
+int GSLocalMemory::rowOffset32[4096];
+int GSLocalMemory::rowOffset32Z[4096];
+int GSLocalMemory::rowOffset16[4096];
+int GSLocalMemory::rowOffset16S[4096];
+int GSLocalMemory::rowOffset16Z[4096];
+int GSLocalMemory::rowOffset16SZ[4096];
+int GSLocalMemory::rowOffset8[2][4096];
+int GSLocalMemory::rowOffset4[2][4096];
 
 short GSLocalMemory::blockOffset32[256];
 short GSLocalMemory::blockOffset32Z[256];
@@ -116,44 +116,44 @@ GSLocalMemory::GSLocalMemory()
 
 	for(int x = 0; x < countof(rowOffset32); x++)
 	{
-		rowOffset32[x] = (int)PixelAddress32(x, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
+		rowOffset32[x] = (int)PixelAddress32(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset32Z); x++)
 	{
-		rowOffset32Z[x] = (int)PixelAddress32Z(x, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
+		rowOffset32Z[x] = (int)PixelAddress32Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset16); x++)
 	{
-		rowOffset16[x] = (int)PixelAddress16(x, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
+		rowOffset16[x] = (int)PixelAddress16(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset16S); x++)
 	{
-		rowOffset16S[x] = (int)PixelAddress16S(x, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
+		rowOffset16S[x] = (int)PixelAddress16S(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset16Z); x++)
 	{
-		rowOffset16Z[x] = (int)PixelAddress16Z(x, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
+		rowOffset16Z[x] = (int)PixelAddress16Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset16SZ); x++)
 	{
-		rowOffset16SZ[x] = (int)PixelAddress16SZ(x, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
+		rowOffset16SZ[x] = (int)PixelAddress16SZ(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset8[0]); x++)
 	{
-		rowOffset8[0][x] = (int)PixelAddress8(x, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
-		rowOffset8[1][x] = (int)PixelAddress8(x, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
+		rowOffset8[0][x] = (int)PixelAddress8(x & 0x7ff, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
+		rowOffset8[1][x] = (int)PixelAddress8(x & 0x7ff, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
 	}
 
 	for(int x = 0; x < countof(rowOffset4[0]); x++)
 	{
-		rowOffset4[0][x] = (int)PixelAddress4(x, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
-		rowOffset4[1][x] = (int)PixelAddress4(x, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
+		rowOffset4[0][x] = (int)PixelAddress4(x & 0x7ff, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
+		rowOffset4[1][x] = (int)PixelAddress4(x & 0x7ff, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
 	}
 
 	for(int x = 0; x < countof(blockOffset32); x++)
@@ -459,7 +459,7 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
 		return i->second;
 	}
 
-	GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 16);
+	GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 32);
 
 	o->hash = hash;
 
@@ -474,9 +474,9 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
 
 	pixelAddress pa = m_psm[psm].pa;
 
-	for(int i = 0; i < 2048; i++)
+	for(int i = 0; i < 4096; i++)
 	{
-		o->pixel.row[i] = (int)pa(0, i, bp, bw);
+		o->pixel.row[i] = (int)pa(0, i & 0x7ff, bp, bw);
 	}
 
 	for(int i = 0; i < 8; i++)
@@ -513,7 +513,7 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G
 		return i->second;
 	}
 
-	GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 16);
+	GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32);
 
 	o->hash = hash;
 
@@ -628,7 +628,7 @@ void GSLocalMemory::WriteImageLeftRight(int l, int r, int y, int h, const uint8*
 template<int psm, int bsx, int bsy, int trbpp>
 void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
 {
-	__aligned16 uint8 buff[64]; // merge buffer for one column
+	__aligned32 uint8 buff[64]; // merge buffer for one column
 
 	uint32 bp = BITBLTBUF.DBP;
 	uint32 bw = BITBLTBUF.DBW;
@@ -1438,7 +1438,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
 
 void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	FOREACH_BLOCK_START(r, 16, 8, 32)
 	{
@@ -1451,7 +1451,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
 
 void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	FOREACH_BLOCK_START(r, 16, 8, 32)
 	{
@@ -1548,7 +1548,7 @@ void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i&
 
 void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	FOREACH_BLOCK_START(r, 16, 8, 32)
 	{
@@ -1561,7 +1561,7 @@ void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i&
 
 void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	FOREACH_BLOCK_START(r, 16, 8, 32)
 	{
@@ -1576,14 +1576,14 @@ void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i
 
 void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	if(TEXA.AEM)
 	{
@@ -1597,7 +1597,7 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
 
 void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
 
@@ -1606,7 +1606,7 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons
 
 void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
 
@@ -1615,49 +1615,49 @@ void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, con
 
 void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
 }
 
 void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	if(TEXA.AEM)
 	{
@@ -1671,7 +1671,7 @@ void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, con
 
 void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
 
@@ -1680,7 +1680,7 @@ void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, con
 
 void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	__aligned16 uint16 block[16 * 8];
+	__aligned32 uint16 block[16 * 8];
 
 	ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
 
@@ -1823,28 +1823,28 @@ void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, cons
 
 void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock4P(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
 }
 
 void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
 {
-	ALIGN_STACK(16);
+	ALIGN_STACK(32);
 
 	ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
 }
@@ -1855,7 +1855,7 @@ HRESULT GSLocalMemory::SaveBMP(const string& fn, uint32 bp, uint32 bw, uint32 ps
 {
 	int pitch = w * 4;
 	int size = pitch * h;
-	void* bits = ::_aligned_malloc(size, 16);
+	void* bits = _aligned_malloc(size, 32);
 
 	GIFRegTEX0 TEX0;
 
diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h
index 4b52190cd9..153db833bc 100644
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@@ -39,7 +39,7 @@ struct GSOffset
 
 	struct
 	{
-		int row[2048]; // yn (n = 0 1 2 ...)
+		int row[4096]; // yn (n = 0 1 2 ...) NOTE: entries 2048..4095 repeat entries 0..2047 (row index is masked with 0x7ff), only transfers should address the upper half (Dark Cloud 2 invention crash)
 		int* col[8]; // rowOffset*
 	} pixel;
 
@@ -116,14 +116,14 @@ protected:
 	static uint32 pageOffset8[32][64][128];
 	static uint32 pageOffset4[32][128][128];
 
-	static int rowOffset32[2048];
-	static int rowOffset32Z[2048];
-	static int rowOffset16[2048];
-	static int rowOffset16S[2048];
-	static int rowOffset16Z[2048];
-	static int rowOffset16SZ[2048];
-	static int rowOffset8[2][2048];
-	static int rowOffset4[2][2048];
+	static int rowOffset32[4096];
+	static int rowOffset32Z[4096];
+	static int rowOffset16[4096];
+	static int rowOffset16S[4096];
+	static int rowOffset16Z[4096];
+	static int rowOffset16SZ[4096];
+	static int rowOffset8[2][4096];
+	static int rowOffset4[2][4096];
 
 	static short blockOffset32[256];
 	static short blockOffset32Z[256];
diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp
index 83df082cb7..78c607ed27 100644
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@@ -29,18 +29,20 @@
 // Using a spinning finish on the main (MTGS) thread is apparently a big win still, over trying
 // to wait out all the pending m_finished semaphores.  It leaves one spinwait in the rasterizer,
 // but that's still worlds better than 2-6 spinning threads like before.
-#define UseSpinningFinish		1
+
+#define UseSpinningFinish
 
 // Set this to 1 to remove a lot of non-const div/modulus ops from the rasterization process.
 // Might likely be a measurable speedup but limits threading to 1, 2, 4, and 8 threads.
 // note by rama: Speedup is around 5% on average.
-#define UseConstThreadCount		0
 
-#if UseConstThreadCount
+// #define UseConstThreadCount
+
+#ifdef UseConstThreadCount
 	// ThreadsConst - const number of threads.  User-configured threads (in GSdx panel) must match
 	// this value if UseConstThreadCount is enabled. [yeah, it's hacky for now]
 	static const int ThreadsConst = 2;
-	static const int ThreadMaskConst = ThreadsConst-1;
+	static const int ThreadMaskConst = ThreadsConst - 1;
 #endif
 
 GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
@@ -57,11 +59,15 @@ GSRasterizer::~GSRasterizer()
 
 __forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
 {
-#if UseConstThreadCount
-	return (ThreadMaskConst==0) || ((scanline & ThreadMaskConst) == m_id);
-#else
+	#ifdef UseConstThreadCount
+
+	return ThreadMaskConst == 0 || (scanline & ThreadMaskConst) == m_id;
+
+	#else
+
 	return (scanline % m_threads) == m_id;
-#endif
+
+	#endif
 }
 
 void GSRasterizer::Draw(const GSRasterizerData* data)
@@ -871,7 +877,7 @@ void GSRasterizerMT::ThreadProc()
 {
 	// _mm_setcsr(MXCSR);
 
-	while( true )
+	while(true)
 	{
 		sem_wait(&m_semaphore);
 
@@ -879,10 +885,15 @@ void GSRasterizerMT::ThreadProc()
 
 		__super::Draw(m_data);
 
-		if( UseSpinningFinish )
-			_interlockedbittestandreset( &m_sync, m_id );
-		else
-			sem_post(&m_finished);
+		#ifdef UseSpinningFinish
+		
+		_interlockedbittestandreset(&m_sync, m_id);
+		
+		#else
+		
+		sem_post(&m_finished);
+		
+		#endif
 	}
 
 	sem_post(&m_stopped);
@@ -917,33 +928,36 @@ void GSRasterizerList::Draw(const GSRasterizerData* data)
 
 	m_sync = m_syncstart;
 
-	for(unsigned i=1; i<size(); ++i)
+	for(size_t i = 1; i < size(); i++)
 	{
 		(*this)[i]->Draw(data);
 	}
 
 	(*this)[0]->Draw(data);
 
-	if( UseSpinningFinish )
+	#ifdef UseSpinningFinish
+
+	while(m_sync) _mm_pause();
+
+	#else
+
+	for(size_t i = 1; i < size(); i++)
 	{
-		while(m_sync) _mm_pause();
-	}
-	else
-	{
-		for(unsigned i=1; i<size(); ++i )
-			sem_wait(&m_finished);
+		sem_wait(&m_finished);
 	}
 
+	#endif
+
 	m_stats.ticks = __rdtsc() - start;
 
-	for(unsigned i=0; i<size(); ++i)
+	for(size_t i = 0; i < size(); i++)
 	{
 		GSRasterizerStats s;
 
 		(*this)[i]->GetStats(s);
 
 		m_stats.pixels += s.pixels;
-		m_stats.prims = max(m_stats.prims, s.prims);
+		m_stats.prims = std::max<int>(m_stats.prims, s.prims);
 	}
 }
 
diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h
index 85fd582f2e..75116869cf 100644
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@@ -30,7 +30,7 @@
 #include "pthread.h"
 #include "semaphore.h"
 
-__aligned16 class GSRasterizerData
+__aligned32 class GSRasterizerData
 {
 public:
 	GSVector4i scissor;
@@ -50,7 +50,7 @@ public:
 	virtual void PrintStats() = 0;
 };
 
-class IDrawScanline : public GSAlignedClass<16>
+class IDrawScanline : public GSAlignedClass<32>
 {
 public:
 	typedef void (__fastcall *DrawScanlineStaticPtr)(int right, int left, int top, const GSVertexSW& v);
@@ -153,9 +153,11 @@ public:
 		push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
 
 		m_syncstart = 0;
+
 		for(int i = 1; i < threads; i++)
 		{
 			push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync));
+
 			_interlockedbittestandset(&m_syncstart, i);
 		}
 	}
diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp
index 539a41ab05..6167f15b0b 100644
--- a/plugins/GSdx/GSRenderer.cpp
+++ b/plugins/GSdx/GSRenderer.cpp
@@ -24,7 +24,7 @@
 
 GSRenderer::GSRenderer()
 	: GSState()
-	, m_tex_buff( (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 16) )
+	, m_tex_buff((uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32))
 	, m_vt(this)
 	, m_dev(NULL)
 	, m_shader(0)
@@ -61,9 +61,10 @@ GSRenderer::~GSRenderer()
 		m_dev->Reset(1, 1, GSDevice::Windowed);
 	}*/
 
-	_aligned_free( m_tex_buff );
+	_aligned_free(m_tex_buff);
 
 	delete m_dev;
+
 	DeleteCriticalSection(&m_pGSsetTitle_Crit);
 }
 
@@ -220,13 +221,6 @@ bool GSRenderer::Merge(int field)
 			r.bottom = r.top + y;
 		}
 
-		// Breaks the blur filter, and actually makes games blurry again.
-		// This might have to do with earlier changes to device size detection.
-		/*if(blurdetected && i == 1)
-		{
-			r += GSVector4i(0, 1).xyxy();
-		}*/
-
 		GSVector4 scale = GSVector4(tex[i]->GetScale()).xyxy();
 
 		src[i] = GSVector4(r) * scale / GSVector4(tex[i]->GetSize()).xyxy();
@@ -380,8 +374,8 @@ void GSRenderer::VSync(int field)
 
 			EnterCriticalSection(&m_pGSsetTitle_Crit);
 
-			strncpy(m_GStitleInfoBuffer, s.c_str(), ArraySize(m_GStitleInfoBuffer)-1);
-			m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer)-1] = 0;// make sure null terminated even if text overflows
+			strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
+			m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer) - 1] = 0;// strncpy writes no terminator when the text is truncated, so always NUL-terminate the last slot
 
 			LeaveCriticalSection(&m_pGSsetTitle_Crit);
 		}
diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h
index 5662d58d9c..dbe3ac740e 100644
--- a/plugins/GSdx/GSRenderer.h
+++ b/plugins/GSdx/GSRenderer.h
@@ -158,12 +158,13 @@ protected:
 	void GrowVertexBuffer()
 	{
 		m_maxcount = max(10000, m_maxcount * 3/2);
-		m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 16);
+		m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 32);
 		m_maxcount -= 100;
 	}
 
 	// Returns a pointer to the drawing vertex. Can return NULL!
-	template<uint32 prim> __fi Vertex* BaseDrawingKick(int& count)
+
+	template<uint32 prim> __forceinline Vertex* DrawingKick(bool skip, int& count)
 	{
 		switch(prim)
 		{
@@ -237,7 +238,7 @@ protected:
 			__assume(0);
 		}
 
-		return v;
+		return !skip ? v : NULL;
 	}
 
 	virtual void Draw() = 0;
diff --git a/plugins/GSdx/GSRendererDX.h b/plugins/GSdx/GSRendererDX.h
index 5678e67e0b..af3f4ac1e1 100644
--- a/plugins/GSdx/GSRendererDX.h
+++ b/plugins/GSdx/GSRendererDX.h
@@ -249,7 +249,9 @@ public:
 		ps_sel.clr1 = om_bsel.IsCLR1();
 		ps_sel.fba = context->FBA.FBA;
 		ps_sel.aout = context->FRAME.PSM == PSM_PSMCT16 || context->FRAME.PSM == PSM_PSMCT16S || (context->FRAME.FBMSK & 0xff000000) == 0x7f000000 ? 1 : 0;
+		
 		if (UserHacks_AlphaHack) ps_sel.aout = 1;
+
 		if(PRIM->FGE)
 		{
 			ps_sel.fog = 1;
diff --git a/plugins/GSdx/GSRendererDX11.cpp b/plugins/GSdx/GSRendererDX11.cpp
index a97eaa3d43..cb4e38f0a6 100644
--- a/plugins/GSdx/GSRendererDX11.cpp
+++ b/plugins/GSdx/GSRendererDX11.cpp
@@ -38,20 +38,20 @@ bool GSRendererDX11::CreateDevice(GSDevice* dev)
 	return true;
 }
 
-void GSRendererDX11::DoVertexKick()
+template<uint32 prim, uint32 tme, uint32 fst> 
+void GSRendererDX11::VertexKick(bool skip)
 {
-	const bool tme = PRIM->TME;
-	const bool fst = PRIM->FST;
-
 	GSVertexHW11& dst = m_vl.AddTail();
 
 	dst.vi[0] = m_v.vi[0];
 	dst.vi[1] = m_v.vi[1];
 
 #ifdef USE_UPSCALE_HACKS
+
 	if(tme && fst)
 	{
 		//GSVector4::storel(&dst.ST, m_v.GetUV());
+
 		int Udiff = 0;
 		int Vdiff = 0;
 		int Uadjust = 0;
@@ -95,6 +95,7 @@ void GSRendererDX11::DoVertexKick()
 				else if	(Vdiff <= 1)	{ Vadjust = 1; }
 			}
 		}
+
 		dst.ST.S = (float)m_v.UV.U - Uadjust;
 		dst.ST.T = (float)m_v.UV.V - Vadjust;
 	}
@@ -104,104 +105,103 @@ void GSRendererDX11::DoVertexKick()
 		//dst.XYZ.X += 5;
 		//dst.XYZ.Y += 5;
 	}
+
 #else
+
 	if(tme && fst)
 	{
 		GSVector4::storel(&dst.ST, m_v.GetUV());
 	}
+
 #endif
-}
 
-template< uint32 prim >
-void GSRendererDX11::DrawingKick( bool skip )
-{
-	int count;
-
-	GSVertexHW11* v = BaseDrawingKick<prim>(count);
-	if (skip || !v) return;
-
-	GSVector4i scissor = m_context->scissor.dx10;
-
-	GSVector4i pmin, pmax;
-
-	#if _M_SSE >= 0x401
-
-	GSVector4i v0, v1, v2;
-
-	switch(prim)
+	int count = 0;
+	
+	if(GSVertexHW11* v = DrawingKick<prim>(skip, count))
 	{
-	case GS_POINTLIST:
-		v0 = GSVector4i::load((int)v[0].p.xy).upl16();
-		pmin = v0;
-		pmax = v0;
-		break;
-	case GS_LINELIST:
-	case GS_LINESTRIP:
-	case GS_SPRITE:
-		v0 = GSVector4i::load((int)v[0].p.xy);
-		v1 = GSVector4i::load((int)v[1].p.xy);
-		pmin = v0.min_u16(v1).upl16();
-		pmax = v0.max_u16(v1).upl16();
-		break;
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-		v0 = GSVector4i::load((int)v[0].p.xy);
-		v1 = GSVector4i::load((int)v[1].p.xy);
-		v2 = GSVector4i::load((int)v[2].p.xy);
-		pmin = v0.min_u16(v1).min_u16(v2).upl16();
-		pmax = v0.max_u16(v1).max_u16(v2).upl16();
-		break;
+		GSVector4i scissor = m_context->scissor.dx10;
+
+		GSVector4i pmin, pmax;
+
+		#if _M_SSE >= 0x401
+
+		GSVector4i v0, v1, v2;
+
+		switch(prim)
+		{
+		case GS_POINTLIST:
+			v0 = GSVector4i::load((int)v[0].p.xy).upl16();
+			pmin = v0;
+			pmax = v0;
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+		case GS_SPRITE:
+			v0 = GSVector4i::load((int)v[0].p.xy);
+			v1 = GSVector4i::load((int)v[1].p.xy);
+			pmin = v0.min_u16(v1).upl16();
+			pmax = v0.max_u16(v1).upl16();
+			break;
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+			v0 = GSVector4i::load((int)v[0].p.xy);
+			v1 = GSVector4i::load((int)v[1].p.xy);
+			v2 = GSVector4i::load((int)v[2].p.xy);
+			pmin = v0.min_u16(v1).min_u16(v2).upl16();
+			pmax = v0.max_u16(v1).max_u16(v2).upl16();
+			break;
+		}
+
+		#else
+
+		switch(prim)
+		{
+		case GS_POINTLIST:
+			pmin.x = v[0].p.x;
+			pmin.y = v[0].p.y;
+			pmax.x = v[0].p.x;
+			pmax.y = v[0].p.y;
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+		case GS_SPRITE:
+			pmin.x = std::min<uint16>(v[0].p.x, v[1].p.x);
+			pmin.y = std::min<uint16>(v[0].p.y, v[1].p.y);
+			pmax.x = std::max<uint16>(v[0].p.x, v[1].p.x);
+			pmax.y = std::max<uint16>(v[0].p.y, v[1].p.y);
+			break;
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+			pmin.x = std::min<uint16>(std::min<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
+			pmin.y = std::min<uint16>(std::min<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
+			pmax.x = std::max<uint16>(std::max<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
+			pmax.y = std::max<uint16>(std::max<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
+			break;
+		}
+
+		#endif
+
+		GSVector4i test = (pmax < scissor) | (pmin > scissor.zwxy());
+
+		switch(prim)
+		{
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+		case GS_SPRITE:
+			test |= pmin == pmax;
+			break;
+		}
+
+		if(test.mask() & 0xff)
+		{
+			return;
+		}
+
+		m_count += count;
 	}
-
-	#else
-
-	switch(prim)
-	{
-	case GS_POINTLIST:
-		pmin.x = v[0].p.x;
-		pmin.y = v[0].p.y;
-		pmax.x = v[0].p.x;
-		pmax.y = v[0].p.y;
-		break;
-	case GS_LINELIST:
-	case GS_LINESTRIP:
-	case GS_SPRITE:
-		pmin.x = std::min<uint16>(v[0].p.x, v[1].p.x);
-		pmin.y = std::min<uint16>(v[0].p.y, v[1].p.y);
-		pmax.x = std::max<uint16>(v[0].p.x, v[1].p.x);
-		pmax.y = std::max<uint16>(v[0].p.y, v[1].p.y);
-		break;
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-		pmin.x = std::min<uint16>(std::min<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
-		pmin.y = std::min<uint16>(std::min<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
-		pmax.x = std::max<uint16>(std::max<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
-		pmax.y = std::max<uint16>(std::max<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
-		break;
-	}
-
-	#endif
-
-	GSVector4i test = (pmax < scissor) | (pmin > scissor.zwxy());
-
-	switch(prim)
-	{
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-	case GS_SPRITE:
-		test |= pmin == pmax;
-		break;
-	}
-
-	if(test.mask() & 0xff)
-	{
-		return;
-	}
-
-	m_count += count;
 }
 
 void GSRendererDX11::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
diff --git a/plugins/GSdx/GSRendererDX11.h b/plugins/GSdx/GSRendererDX11.h
index 81c1c54663..fc7f8a0337 100644
--- a/plugins/GSdx/GSRendererDX11.h
+++ b/plugins/GSdx/GSRendererDX11.h
@@ -36,8 +36,5 @@ public:
 
 	bool CreateDevice(GSDevice* dev);
 
-	template<uint32 prim>
-	void DrawingKick( bool skip );
-
-	void DoVertexKick();
+	template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
 };
diff --git a/plugins/GSdx/GSRendererDX9.cpp b/plugins/GSdx/GSRendererDX9.cpp
index 2d03de03bc..a6e7fd0d9a 100644
--- a/plugins/GSdx/GSRendererDX9.cpp
+++ b/plugins/GSdx/GSRendererDX9.cpp
@@ -57,11 +57,9 @@ bool GSRendererDX9::CreateDevice(GSDevice* dev)
 	return true;
 }
 
-void GSRendererDX9::DoVertexKick()
+template<uint32 prim, uint32 tme, uint32 fst> 
+void GSRendererDX9::VertexKick(bool skip)
 {
-	const bool tme = PRIM->TME;
-	const bool fst = PRIM->FST;
-
 	GSVertexHW9& dst = m_vl.AddTail();
 
 	dst.p = GSVector4(((GSVector4i)m_v.XYZ).upl16());
@@ -142,92 +140,90 @@ void GSRendererDX9::DoVertexKick()
 
 	dst.c0 = m_v.RGBAQ.u32[0];
 	dst.c1 = m_v.FOG.u32[1];
-}
 
-template< uint32 prim >
-void GSRendererDX9::DrawingKick( bool skip )
-{
-	int count;
+	//
 
 	// BaseDrawingKick can never return NULL here because the DrawingKick function
 	// tables route to DrawingKickNull for GS_INVLALID prim types (and that's the only
 	// condition where this function would return NULL).
 
-	GSVertexHW9* v = BaseDrawingKick<prim>(count);
-	if (skip || !v) return;
-
-	GSVector4 scissor = m_context->scissor.dx9;
-
-	GSVector4 pmin, pmax;
-
-	switch(prim)
+	int count = 0;
+	
+	if(GSVertexHW9* v = DrawingKick<prim>(skip, count))
 	{
-	case GS_POINTLIST:
-		pmin = v[0].p;
-		pmax = v[0].p;
-		break;
-	case GS_LINELIST:
-	case GS_LINESTRIP:
-	case GS_SPRITE:
-		pmin = v[0].p.min(v[1].p);
-		pmax = v[0].p.max(v[1].p);
-		break;
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-		pmin = v[0].p.min(v[1].p).min(v[2].p);
-		pmax = v[0].p.max(v[1].p).max(v[2].p);
-		break;
+		GSVector4 scissor = m_context->scissor.dx9;
+
+		GSVector4 pmin, pmax;
+
+		switch(prim)
+		{
+		case GS_POINTLIST:
+			pmin = v[0].p;
+			pmax = v[0].p;
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+		case GS_SPRITE:
+			pmin = v[0].p.min(v[1].p);
+			pmax = v[0].p.max(v[1].p);
+			break;
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+			pmin = v[0].p.min(v[1].p).min(v[2].p);
+			pmax = v[0].p.max(v[1].p).max(v[2].p);
+			break;
+		}
+
+		GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
+
+		switch(prim)
+		{
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+		case GS_SPRITE:
+			test |= pmin == pmax;
+			break;
+		}
+
+		if(test.mask() & 3)
+		{
+			return;
+		}
+
+		switch(prim)
+		{
+		case GS_POINTLIST:
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+			if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
+			break;
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+			if(PRIM->IIP == 0) {v[0].c0 = v[1].c0 = v[2].c0;}
+			break;
+		case GS_SPRITE:
+			if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
+			v[0].p.z = v[1].p.z;
+			v[0].p.w = v[1].p.w;
+			v[0].c1 = v[1].c1;
+			v[2] = v[1];
+			v[3] = v[1];
+			v[1].p.y = v[0].p.y;
+			v[1].t.y = v[0].t.y;
+			v[2].p.x = v[0].p.x;
+			v[2].t.x = v[0].t.x;
+			v[4] = v[1];
+			v[5] = v[2];
+			count += 4;
+			break;
+		}
+
+		m_count += count;
 	}
-
-	GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
-
-	switch(prim)
-	{
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-	case GS_SPRITE:
-		test |= pmin == pmax;
-		break;
-	}
-
-	if(test.mask() & 3)
-	{
-		return;
-	}
-
-	switch(prim)
-	{
-	case GS_POINTLIST:
-		break;
-	case GS_LINELIST:
-	case GS_LINESTRIP:
-		if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
-		break;
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-		if(PRIM->IIP == 0) {v[0].c0 = v[1].c0 = v[2].c0;}
-		break;
-	case GS_SPRITE:
-		if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
-		v[0].p.z = v[1].p.z;
-		v[0].p.w = v[1].p.w;
-		v[0].c1 = v[1].c1;
-		v[2] = v[1];
-		v[3] = v[1];
-		v[1].p.y = v[0].p.y;
-		v[1].t.y = v[0].t.y;
-		v[2].p.x = v[0].p.x;
-		v[2].t.x = v[0].t.x;
-		v[4] = v[1];
-		v[5] = v[2];
-		count += 4;
-		break;
-	}
-
-	m_count += count;
 }
 
 void GSRendererDX9::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
diff --git a/plugins/GSdx/GSRendererDX9.h b/plugins/GSdx/GSRendererDX9.h
index 4483c2c9d9..f4d8eb73a8 100644
--- a/plugins/GSdx/GSRendererDX9.h
+++ b/plugins/GSdx/GSRendererDX9.h
@@ -43,8 +43,5 @@ public:
 
 	bool CreateDevice(GSDevice* dev);
 
-	template<uint32 prim>
-	void DrawingKick( bool skip );
-
-	void DoVertexKick();
+	template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
 };
diff --git a/plugins/GSdx/GSRendererNull.h b/plugins/GSdx/GSRendererNull.h
index 845965e5fd..9fbc84e179 100644
--- a/plugins/GSdx/GSRendererNull.h
+++ b/plugins/GSdx/GSRendererNull.h
@@ -43,10 +43,7 @@ public:
 		InitVertexKick<GSRendererNull>();
 	}
 
-	virtual ~GSRendererNull() {}
-
-	template<uint32 prim>
-	void DrawingKick( bool skip ) {}
-
-	void DoVertexKick() {}
+	template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip)
+	{
+	}
 };
diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp
index 80ee90df54..2abe02adf1 100644
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@@ -94,6 +94,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
 	if(m_dev->ResizeTexture(&m_texture[i], w, h))
 	{
 		uint8* buff = GetTextureBufferLock();
+
 		static int pitch = 1024 * 4;
 
 		GSVector4i r(0, 0, w, h);
@@ -113,6 +114,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
 
 			s_n++;
 		}
+
 		ReleaseTextureBufferLock();
 	}
 
@@ -427,24 +429,22 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
 	}
 }
 
-void GSRendererSW::DoVertexKick()
+template<uint32 prim, uint32 tme, uint32 fst> 
+void GSRendererSW::VertexKick(bool skip)
 {
-	const bool tme = PRIM->TME;
-	const bool fst = PRIM->FST;
-
-	const GSDrawingContext& context = *m_context;
+	const GSDrawingContext* context = m_context;
 
 	GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]);
-
+	
 	xy = xy.insert16<3>(m_v.FOG.F);
 	xy = xy.upl16();
-	xy -= context.XYOFFSET;
+	xy -= context->XYOFFSET;
 
-	GSVertexSW& dst = m_vl.AddTail();
+	GSVertexSW v;
 
-	dst.p = GSVector4(xy) * g_pos_scale;
+	v.p = GSVector4(xy) * g_pos_scale;
 
-	dst.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
+	v.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
 
 	if(tme)
 	{
@@ -452,37 +452,31 @@ void GSRendererSW::DoVertexKick()
 
 		if(fst)
 		{
-			dst.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
+			v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
 			q = 1.0f;
 		}
 		else
 		{
-			dst.t = GSVector4(m_v.ST.S, m_v.ST.T);
-			dst.t *= GSVector4(0x10000 << context.TEX0.TW, 0x10000 << context.TEX0.TH);
+			v.t = GSVector4(m_v.ST.S, m_v.ST.T);
+			v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH);
 			q = m_v.RGBAQ.Q;
 		}
 
-		dst.t = dst.t.xyxy(GSVector4::load(q));
+		v.t = v.t.xyxy(GSVector4::load(q));
 	}
 
+	GSVertexSW& dst = m_vl.AddTail();
+
+	dst = v;
+
 	dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion
-}
 
-
-template< uint32 prim >
-void GSRendererSW::DrawingKick( bool skip )
-{
-	int count;
-
-	// BaseDrawingKick can never return NULL here because the DrawingKick function
-	// tables route to DrawingKickNull for GS_INVLALID prim types (and that's the only
-	// condition where this function would return NULL).
-
-	GSVertexSW* v = BaseDrawingKick<prim>(count);
-	if (skip || !v) return;
-
-	if(!m_dump)
+	int count = 0;
+	
+	if(GSVertexSW* v = DrawingKick<prim>(skip, count))
 	{
+if(!m_dump)
+{
 		GSVector4 pmin, pmax;
 
 		switch(prim)
@@ -505,7 +499,7 @@ void GSRendererSW::DrawingKick( bool skip )
 			break;
 		}
 
-		GSVector4 scissor = m_context->scissor.ex;
+		GSVector4 scissor = context->scissor.ex;
 
 		GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
 
@@ -529,77 +523,77 @@ void GSRendererSW::DrawingKick( bool skip )
 			test |= tmp == tmp.yxwz();
 			break;
 		}
-
+		
 		if(test.mask() & 3)
 		{
 			return;
 		}
-	}
-
-	switch(prim)
-	{
-	case GS_POINTLIST:
-		break;
-	case GS_LINELIST:
-	case GS_LINESTRIP:
-		if(PRIM->IIP == 0) {v[0].c = v[1].c;}
-		break;
-	case GS_TRIANGLELIST:
-	case GS_TRIANGLESTRIP:
-	case GS_TRIANGLEFAN:
-		if(PRIM->IIP == 0) {v[0].c = v[2].c; v[1].c = v[2].c;}
-		break;
-	case GS_SPRITE:
-		break;
-	}
-
-	if(m_count < 30 && m_count >= 3)
-	{
-		GSVertexSW* v = &m_vertices[m_count - 3];
-
-		int tl = 0;
-		int br = 0;
-
-		bool isquad = false;
-
+}
 		switch(prim)
 		{
+		case GS_POINTLIST:
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+			if(PRIM->IIP == 0) {v[0].c = v[1].c;}
+			break;
+		case GS_TRIANGLELIST:
 		case GS_TRIANGLESTRIP:
 		case GS_TRIANGLEFAN:
-		case GS_TRIANGLELIST:
-			isquad = GSVertexSW::IsQuad(v, tl, br);
+			if(PRIM->IIP == 0) {v[0].c = v[2].c; v[1].c = v[2].c;}
+			break;
+		case GS_SPRITE:
 			break;
 		}
 
-		if(isquad)
+		if(m_count < 30 && m_count >= 3)
 		{
-			m_count -= 3;
+			GSVertexSW* v = &m_vertices[m_count - 3];
 
-			if(m_count > 0)
+			int tl = 0;
+			int br = 0;
+
+			bool isquad = false;
+
+			switch(prim)
 			{
-				tl += m_count;
-				br += m_count;
-
-				Flush();
+			case GS_TRIANGLESTRIP:
+			case GS_TRIANGLEFAN:
+			case GS_TRIANGLELIST:
+				isquad = GSVertexSW::IsQuad(v, tl, br);
+				break;
 			}
 
-			if(tl != 0) m_vertices[0] = m_vertices[tl];
-			if(br != 1) m_vertices[1] = m_vertices[br];
+			if(isquad)
+			{
+				m_count -= 3;
 
-			m_count = 2;
+				if(m_count > 0)
+				{
+					tl += m_count;
+					br += m_count;
 
-			uint32 tmp = PRIM->PRIM;
-			PRIM->PRIM = GS_SPRITE;
+					Flush();
+				}
 
-			Flush();
+				if(tl != 0) m_vertices[0] = m_vertices[tl];
+				if(br != 1) m_vertices[1] = m_vertices[br];
 
-			PRIM->PRIM = tmp;
+				m_count = 2;
 
-			m_perfmon.Put(GSPerfMon::Quad, 1);
+				uint32 tmp = PRIM->PRIM;
+				PRIM->PRIM = GS_SPRITE;
 
-			return;
+				Flush();
+
+				PRIM->PRIM = tmp;
+
+				m_perfmon.Put(GSPerfMon::Quad, 1);
+
+				return;
+			}
 		}
-	}
 
-	m_count += count;
+		m_count += count;
+	}
 }
diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h
index 15a3fe07a1..abe8b60ee0 100644
--- a/plugins/GSdx/GSRendererSW.h
+++ b/plugins/GSdx/GSRendererSW.h
@@ -47,13 +47,6 @@ public:
 	GSRendererSW();
 	virtual ~GSRendererSW();
 
-	template<uint32 prim>
-	void DrawingKick( bool skip );
-
-	void DoVertexKick();
-
-	void InvalidateTextureCache()
-	{
-		m_tc->RemoveAll();
-	}
+	template<uint32 prim, uint32 tme, uint32 fst> 
+	void VertexKick(bool skip);
 };
diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h
index 94c9f7011b..d2d1d78685 100644
--- a/plugins/GSdx/GSScanlineEnvironment.h
+++ b/plugins/GSdx/GSScanlineEnvironment.h
@@ -99,7 +99,7 @@ union GSScanlineSelector
 	}
 };
 
-__aligned16 struct GSScanlineParam
+__aligned32 struct GSScanlineParam
 {
 	GSScanlineSelector sel;
 
@@ -115,7 +115,7 @@ __aligned16 struct GSScanlineParam
 	uint32 fm, zm;
 };
 
-__aligned16 struct GSScanlineEnvironment
+__aligned32 struct GSScanlineEnvironment
 {
 	void* vm;
 	const void* tex;
diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp
index 38d7e9fcef..3ed3b1a4dd 100644
--- a/plugins/GSdx/GSSettingsDlg.cpp
+++ b/plugins/GSdx/GSSettingsDlg.cpp
@@ -88,7 +88,9 @@ void GSSettingsDlg::OnInit()
 		ComboBoxAppend(IDC_RESOLUTION, "Please select...", (LPARAM)&m_modes.back(), true);
 
 		CComPtr<IDirect3D9> d3d;
+
 		d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
+		
 		if(d3d)
 		{
 			uint32 w = theApp.GetConfig("ModeWidth", 0);
@@ -151,10 +153,13 @@ void GSSettingsDlg::OnInit()
 	SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETRANGE, 0, MAKELPARAM(8192, 256));
 	SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resy", 1024), 0));
 
-	int r=theApp.GetConfig("Renderer", 0);
-	if (r>=0 && r<=2){//DX9
+	int r = theApp.GetConfig("Renderer", 0);
+
+	if(r >= 0 && r <= 2) // DX9
+	{
 		GSDevice9::ForceValidMsaaConfig();
-		m_lastValidMsaa=theApp.GetConfig("msaa", 0);
+
+		m_lastValidMsaa = theApp.GetConfig("msaa", 0);
 	}
 
 	SendMessage(GetDlgItem(m_hWnd, IDC_MSAA), UDM_SETRANGE, 0, MAKELPARAM(16, 0));
diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp
index f6f48f6b77..d4a8bd3ae4 100644
--- a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp
+++ b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp
@@ -48,7 +48,14 @@ void GSSetupPrimCodeGenerator::Generate()
 	{
 		for(int i = 0; i < 5; i++)
 		{
-			movaps(Xmm(3 + i), xmmword[&m_shift[i]]);
+			if(m_cpu.has(util::Cpu::tAVX))
+			{
+				vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
+			}
+			else
+			{
+				movaps(Xmm(3 + i), ptr[&m_shift[i]]);
+			}
 		}
 	}
 
@@ -68,113 +75,221 @@ void GSSetupPrimCodeGenerator::Depth()
 		return;
 	}
 
-	if(!m_sel.sprite)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// GSVector4 t = dscan.p;
-
-		movaps(xmm0, xmmword[edx + 16]);
-
-		if(m_en.f)
+		if(!m_sel.sprite)
 		{
-			// GSVector4 df = p.wwww();
+			// GSVector4 p = dscan.p;
 
-			movaps(xmm1, xmm0);
-			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+			vmovaps(xmm0, ptr[edx + 16]);
 
-			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
-
-			movaps(xmm2, xmm1);
-			mulps(xmm2, xmm3);
-			cvttps2dq(xmm2, xmm2);
-			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
-			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
-			movdqa(xmmword[&m_env.d4.f], xmm2);
-
-			for(int i = 0; i < 4; i++)
+			if(m_en.f)
 			{
-				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+				// GSVector4 df = p.wwww();
 
-				movaps(xmm2, xmm1);
-				mulps(xmm2, Xmm(4 + i));
-				cvttps2dq(xmm2, xmm2);
-				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
-				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
-				movdqa(xmmword[&m_env.d[i].f], xmm2);
+				vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+				// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+
+				vmulps(xmm2, xmm1, xmm3);
+				vcvttps2dq(xmm2, xmm2);
+				vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				vmovdqa(ptr[&m_env.d4.f], xmm2);
+
+				for(int i = 0; i < 4; i++)
+				{
+					// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+
+					vmulps(xmm2, xmm1, Xmm(4 + i));
+					vcvttps2dq(xmm2, xmm2);
+					vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+					vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+					vmovdqa(ptr[&m_env.d[i].f], xmm2);
+				}
+			}
+
+			if(m_en.z)
+			{
+				// GSVector4 dz = p.zzzz();
+
+				vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				// m_env.d4.z = dz * 4.0f;
+
+				vmulps(xmm1, xmm0, xmm3);
+				vmovdqa(ptr[&m_env.d4.z], xmm1);
+
+				for(int i = 0; i < 4; i++)
+				{
+					// m_env.d[i].z = dz * m_shift[i];
+
+					vmulps(xmm1, xmm0, Xmm(4 + i));
+					vmovdqa(ptr[&m_env.d[i].z], xmm1);
+				}
 			}
 		}
-
-		if(m_en.z)
+		else
 		{
-			// GSVector4 dz = p.zzzz();
+			// GSVector4 p = vertices[0].p;
 
-			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+			vmovaps(xmm0, ptr[ecx + 16]);
 
-			// m_env.d4.z = dz * 4.0f;
-
-			movaps(xmm1, xmm0);
-			mulps(xmm1, xmm3);
-			movdqa(xmmword[&m_env.d4.z], xmm1);
-
-			for(int i = 0; i < 4; i++)
+			if(m_en.f)
 			{
-				// m_env.d[i].z = dz * m_shift[i];
+				// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
 
-				movaps(xmm1, xmm0);
-				mulps(xmm1, Xmm(4 + i));
-				movdqa(xmmword[&m_env.d[i].z], xmm1);
+				vcvttps2dq(xmm1, xmm0);
+				vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vmovdqa(ptr[&m_env.p.f], xmm1);
+			}
+
+			if(m_en.z)
+			{
+				// GSVector4 z = p.zzzz();
+
+				vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				if(m_sel.zoverflow)
+				{
+					// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+					static const float half = 0.5f;
+
+					vmovss(xmm1, dword[&half]);
+					vshufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
+					vmulps(xmm1, xmm0);
+					vcvttps2dq(xmm1, xmm1);
+					vpslld(xmm1, 1);
+
+					vcvttps2dq(xmm0, xmm0);
+					vpcmpeqd(xmm2, xmm2);
+					vpsrld(xmm2, 31);
+					vpand(xmm0, xmm2);
+
+					vpor(xmm0, xmm1);
+				}
+				else
+				{
+					// m_env.p.z = GSVector4i(z);
+
+					vcvttps2dq(xmm0, xmm0);
+				}
+
+				vmovdqa(ptr[&m_env.p.z], xmm0);
 			}
 		}
 	}
 	else
 	{
-		// GSVector4 p = vertices[0].p;
-
-		movaps(xmm0, xmmword[ecx + 16]);
-
-		if(m_en.f)
+		if(!m_sel.sprite)
 		{
-			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
+			// GSVector4 p = dscan.p;
 
-			movaps(xmm1, xmm0);
-			cvttps2dq(xmm1, xmm1);
-			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-			movdqa(xmmword[&m_env.p.f], xmm1);
+			movaps(xmm0, ptr[edx + 16]);
+
+			if(m_en.f)
+			{
+				// GSVector4 df = p.wwww();
+
+				movaps(xmm1, xmm0);
+				shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+				// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+
+				movaps(xmm2, xmm1);
+				mulps(xmm2, xmm3);
+				cvttps2dq(xmm2, xmm2);
+				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				movdqa(ptr[&m_env.d4.f], xmm2);
+
+				for(int i = 0; i < 4; i++)
+				{
+					// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+
+					movaps(xmm2, xmm1);
+					mulps(xmm2, Xmm(4 + i));
+					cvttps2dq(xmm2, xmm2);
+					pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+					pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+					movdqa(ptr[&m_env.d[i].f], xmm2);
+				}
+			}
+
+			if(m_en.z)
+			{
+				// GSVector4 dz = p.zzzz();
+
+				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				// m_env.d4.z = dz * 4.0f;
+
+				movaps(xmm1, xmm0);
+				mulps(xmm1, xmm3);
+				movdqa(ptr[&m_env.d4.z], xmm1);
+
+				for(int i = 0; i < 4; i++)
+				{
+					// m_env.d[i].z = dz * m_shift[i];
+
+					movaps(xmm1, xmm0);
+					mulps(xmm1, Xmm(4 + i));
+					movdqa(ptr[&m_env.d[i].z], xmm1);
+				}
+			}
 		}
-
-		if(m_en.z)
+		else
 		{
-			// GSVector4 z = p.zzzz();
+			// GSVector4 p = vertices[0].p;
 
-			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+			movaps(xmm0, ptr[ecx + 16]);
 
-			if(m_sel.zoverflow)
+			if(m_en.f)
 			{
-				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+				// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
 
-				static const float half = 0.5f;
-
-				movss(xmm1, dword[&half]);
-				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
-				mulps(xmm1, xmm0);
-				cvttps2dq(xmm1, xmm1);
-				pslld(xmm1, 1);
-
-				cvttps2dq(xmm0, xmm0);
-				pcmpeqd(xmm2, xmm2);
-				psrld(xmm2, 31);
-				pand(xmm0, xmm2);
-
-				por(xmm0, xmm1);
-			}
-			else
-			{
-				// m_env.p.z = GSVector4i(z);
-
-				cvttps2dq(xmm0, xmm0);
+				cvttps2dq(xmm1, xmm0);
+				pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				movdqa(ptr[&m_env.p.f], xmm1);
 			}
 
-			movdqa(xmmword[&m_env.p.z], xmm0);
+			if(m_en.z)
+			{
+				// GSVector4 z = p.zzzz();
+
+				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				if(m_sel.zoverflow)
+				{
+					// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+					static const float half = 0.5f;
+
+					movss(xmm1, dword[&half]);
+					shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
+					mulps(xmm1, xmm0);
+					cvttps2dq(xmm1, xmm1);
+					pslld(xmm1, 1);
+
+					cvttps2dq(xmm0, xmm0);
+					pcmpeqd(xmm2, xmm2);
+					psrld(xmm2, 31);
+					pand(xmm0, xmm2);
+
+					por(xmm0, xmm1);
+				}
+				else
+				{
+					// m_env.p.z = GSVector4i(z);
+
+					cvttps2dq(xmm0, xmm0);
+				}
+
+				movdqa(ptr[&m_env.p.z], xmm0);
+			}
 		}
 	}
 }
@@ -186,64 +301,129 @@ void GSSetupPrimCodeGenerator::Texture()
 		return;
 	}
 
-	// GSVector4 t = dscan.t;
-
-	movaps(xmm0, xmmword[edx + 32]);
-
-	movaps(xmm1, xmm0);
-	mulps(xmm1, xmm3);
-
-	if(m_sel.fst)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// m_env.d4.st = GSVector4i(t * 4.0f);
+		// GSVector4 t = dscan.t;
 
-		cvttps2dq(xmm1, xmm1);
-		movdqa(xmmword[&m_env.d4.st], xmm1);
+		vmovaps(xmm0, ptr[edx + 32]);
+
+		vmulps(xmm1, xmm0, xmm3);
+
+		if(m_sel.fst)
+		{
+			// m_env.d4.st = GSVector4i(t * 4.0f);
+
+			vcvttps2dq(xmm1, xmm1);
+			vmovdqa(ptr[&m_env.d4.st], xmm1);
+		}
+		else
+		{
+			// m_env.d4.stq = t * 4.0f;
+
+			vmovaps(ptr[&m_env.d4.stq], xmm1);
+		}
+
+		for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+		{
+			// GSVector4 ds = t.xxxx();
+			// GSVector4 dt = t.yyyy();
+			// GSVector4 dq = t.zzzz();
+
+			vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+			for(int i = 0; i < 4; i++)
+			{
+				// GSVector4 v = ds/dt/dq * m_shift[i];
+
+				vmulps(xmm2, xmm1, Xmm(4 + i));
+
+				if(m_sel.fst)
+				{
+					// m_env.d[i].si/ti = GSVector4i(v);
+
+					vcvttps2dq(xmm2, xmm2);
+
+					switch(j)
+					{
+					case 0: vmovdqa(ptr[&m_env.d[i].si], xmm2); break;
+					case 1: vmovdqa(ptr[&m_env.d[i].ti], xmm2); break;
+					}
+				}
+				else
+				{
+					// m_env.d[i].s/t/q = v;
+
+					switch(j)
+					{
+					case 0: vmovaps(ptr[&m_env.d[i].s], xmm2); break;
+					case 1: vmovaps(ptr[&m_env.d[i].t], xmm2); break;
+					case 2: vmovaps(ptr[&m_env.d[i].q], xmm2); break;
+					}
+				}
+			}
+		}
 	}
 	else
 	{
-		// m_env.d4.stq = t * 4.0f;
+		// GSVector4 t = dscan.t;
 
-		movaps(xmmword[&m_env.d4.stq], xmm1);
-	}
-
-	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
-	{
-		// GSVector4 ds = t.xxxx();
-		// GSVector4 dt = t.yyyy();
-		// GSVector4 dq = t.zzzz();
+		movaps(xmm0, ptr[edx + 32]);
 
 		movaps(xmm1, xmm0);
-		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
+		mulps(xmm1, xmm3);
 
-		for(int i = 0; i < 4; i++)
+		if(m_sel.fst)
 		{
-			// GSVector4 v = ds/dt * m_shift[i];
+			// m_env.d4.st = GSVector4i(t * 4.0f);
 
-			movaps(xmm2, xmm1);
-			mulps(xmm2, Xmm(4 + i));
+			cvttps2dq(xmm1, xmm1);
+			movdqa(ptr[&m_env.d4.st], xmm1);
+		}
+		else
+		{
+			// m_env.d4.stq = t * 4.0f;
 
-			if(m_sel.fst)
+			movaps(ptr[&m_env.d4.stq], xmm1);
+		}
+
+		for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+		{
+			// GSVector4 ds = t.xxxx();
+			// GSVector4 dt = t.yyyy();
+			// GSVector4 dq = t.zzzz();
+
+			movaps(xmm1, xmm0);
+			shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+			for(int i = 0; i < 4; i++)
 			{
-				// m_env.d[i].si/ti = GSVector4i(v);
+				// GSVector4 v = ds/dt/dq * m_shift[i];
 
-				cvttps2dq(xmm2, xmm2);
+				movaps(xmm2, xmm1);
+				mulps(xmm2, Xmm(4 + i));
 
-				switch(j)
+				if(m_sel.fst)
 				{
-				case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break;
-				case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
+					// m_env.d[i].si/ti = GSVector4i(v);
+
+					cvttps2dq(xmm2, xmm2);
+
+					switch(j)
+					{
+					case 0: movdqa(ptr[&m_env.d[i].si], xmm2); break;
+					case 1: movdqa(ptr[&m_env.d[i].ti], xmm2); break;
+					}
 				}
-			}
-			else
-			{
-				// m_env.d[i].s/t/q = v;
-
-				switch(j)
+				else
 				{
-				case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break;
-				case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
-				case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break;
+					// m_env.d[i].s/t/q = v;
+
+					switch(j)
+					{
+					case 0: movaps(ptr[&m_env.d[i].s], xmm2); break;
+					case 1: movaps(ptr[&m_env.d[i].t], xmm2); break;
+					case 2: movaps(ptr[&m_env.d[i].q], xmm2); break;
+					}
 				}
 			}
 		}
@@ -257,113 +437,217 @@ void GSSetupPrimCodeGenerator::Color()
 		return;
 	}
 
-	if(m_sel.iip)
+	if(m_cpu.has(util::Cpu::tAVX))
 	{
-		// GSVector4 c = dscan.c;
-
-		movaps(xmm0, xmmword[edx]);
-		movaps(xmm1, xmm0);
-
-		// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
-
-		movaps(xmm2, xmm0);
-		mulps(xmm2, xmm3);
-		cvttps2dq(xmm2, xmm2);
-		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
-		packssdw(xmm2, xmm2);
-		movdqa(xmmword[&m_env.d4.c], xmm2);
-
-		// xmm3 is not needed anymore
-
-		// GSVector4 dr = c.xxxx();
-		// GSVector4 db = c.zzzz();
-
-		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
-		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-
-		for(int i = 0; i < 4; i++)
+		if(m_sel.iip)
 		{
-			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+			// GSVector4 c = dscan.c;
 
-			movaps(xmm2, xmm0);
-			mulps(xmm2, Xmm(4 + i));
-			cvttps2dq(xmm2, xmm2);
-			packssdw(xmm2, xmm2);
+			vmovaps(xmm0, ptr[edx]);
 
-			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+			// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
 
-			movaps(xmm3, xmm1);
-			mulps(xmm3, Xmm(4 + i));
-			cvttps2dq(xmm3, xmm3);
-			packssdw(xmm3, xmm3);
+			vmulps(xmm1, xmm0, xmm3);
+			vcvttps2dq(xmm1, xmm1);
+			vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
+			vpackssdw(xmm1, xmm1);
+			vmovdqa(ptr[&m_env.d4.c], xmm1);
 
-			// m_env.d[i].rb = r.upl16(b);
+			// xmm3 is not needed anymore
 
-			punpcklwd(xmm2, xmm3);
-			movdqa(xmmword[&m_env.d[i].rb], xmm2);
+			// GSVector4 dr = c.xxxx();
+			// GSVector4 db = c.zzzz();
+
+			vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			for(int i = 0; i < 4; i++)
+			{
+				// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+
+				vmulps(xmm0, xmm2, Xmm(4 + i));
+				vcvttps2dq(xmm0, xmm0);
+				vpackssdw(xmm0, xmm0);
+
+				// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+				vmulps(xmm1, xmm3, Xmm(4 + i));
+				vcvttps2dq(xmm1, xmm1);
+				vpackssdw(xmm1, xmm1);
+
+				// m_env.d[i].rb = r.upl16(b);
+
+				vpunpcklwd(xmm0, xmm1);
+				vmovdqa(ptr[&m_env.d[i].rb], xmm0);
+			}
+
+			// GSVector4 c = dscan.c;
+
+			vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
+
+			// GSVector4 dg = c.yyyy();
+			// GSVector4 da = c.wwww();
+
+			vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+			vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+			for(int i = 0; i < 4; i++)
+			{
+				// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+				vmulps(xmm0, xmm2, Xmm(4 + i));
+				vcvttps2dq(xmm0, xmm0);
+				vpackssdw(xmm0, xmm0);
+
+				// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+				vmulps(xmm1, xmm3, Xmm(4 + i));
+				vcvttps2dq(xmm1, xmm1);
+				vpackssdw(xmm1, xmm1);
+
+				// m_env.d[i].ga = g.upl16(a);
+
+				vpunpcklwd(xmm0, xmm1);
+				vmovdqa(ptr[&m_env.d[i].ga], xmm0);
+			}
 		}
-
-		// GSVector4 c = dscan.c;
-
-		movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
-		movaps(xmm1, xmm0);
-
-		// GSVector4 dg = c.yyyy();
-		// GSVector4 da = c.wwww();
-
-		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
-		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
-
-		for(int i = 0; i < 4; i++)
+		else
 		{
-			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+			// GSVector4i c = GSVector4i(vertices[0].c);
 
-			movaps(xmm2, xmm0);
-			mulps(xmm2, Xmm(4 + i));
-			cvttps2dq(xmm2, xmm2);
-			packssdw(xmm2, xmm2);
+			vcvttps2dq(xmm0, ptr[ecx]);
 
-			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+			// c = c.upl16(c.zwxy());
 
-			movaps(xmm3, xmm1);
-			mulps(xmm3, Xmm(4 + i));
-			cvttps2dq(xmm3, xmm3);
-			packssdw(xmm3, xmm3);
+			vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+			vpunpcklwd(xmm0, xmm1);
 
-			// m_env.d[i].ga = g.upl16(a);
+			// if(tfx == TFX_NONE) c = c.srl16(7);
 
-			punpcklwd(xmm2, xmm3);
-			movdqa(xmmword[&m_env.d[i].ga], xmm2);
+			if(m_sel.tfx == TFX_NONE)
+			{
+				vpsrlw(xmm0, 7);
+			}
+
+			// m_env.c.rb = c.xxxx();
+			// m_env.c.ga = c.zzzz();
+
+			vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			vmovdqa(ptr[&m_env.c.rb], xmm1);
+			vmovdqa(ptr[&m_env.c.ga], xmm2);
 		}
 	}
 	else
 	{
-		// GSVector4i c = GSVector4i(vertices[0].c);
-
-		movaps(xmm0, xmmword[ecx]);
-		cvttps2dq(xmm0, xmm0);
-
-		// c = c.upl16(c.zwxy());
-
-		movdqa(xmm1, xmm0);
-		pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
-		punpcklwd(xmm0, xmm1);
-
-		// if(!tme) c = c.srl16(7);
-
-		if(m_sel.tfx == TFX_NONE)
+		if(m_sel.iip)
 		{
-			psrlw(xmm0, 7);
+			// GSVector4 c = dscan.c;
+
+			movaps(xmm0, ptr[edx]);
+			movaps(xmm1, xmm0);
+
+			// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+			movaps(xmm2, xmm0);
+			mulps(xmm2, xmm3);
+			cvttps2dq(xmm2, xmm2);
+			pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
+			packssdw(xmm2, xmm2);
+			movdqa(ptr[&m_env.d4.c], xmm2);
+
+			// xmm3 is not needed anymore
+
+			// GSVector4 dr = c.xxxx();
+			// GSVector4 db = c.zzzz();
+
+			shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+
+			for(int i = 0; i < 4; i++)
+			{
+				// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+
+				movaps(xmm2, xmm0);
+				mulps(xmm2, Xmm(4 + i));
+				cvttps2dq(xmm2, xmm2);
+				packssdw(xmm2, xmm2);
+
+				// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+				movaps(xmm3, xmm1);
+				mulps(xmm3, Xmm(4 + i));
+				cvttps2dq(xmm3, xmm3);
+				packssdw(xmm3, xmm3);
+
+				// m_env.d[i].rb = r.upl16(b);
+
+				punpcklwd(xmm2, xmm3);
+				movdqa(ptr[&m_env.d[i].rb], xmm2);
+			}
+
+			// GSVector4 c = dscan.c;
+
+			movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
+			movaps(xmm1, xmm0);
+
+			// GSVector4 dg = c.yyyy();
+			// GSVector4 da = c.wwww();
+
+			shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+			for(int i = 0; i < 4; i++)
+			{
+				// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+				movaps(xmm2, xmm0);
+				mulps(xmm2, Xmm(4 + i));
+				cvttps2dq(xmm2, xmm2);
+				packssdw(xmm2, xmm2);
+
+				// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+				movaps(xmm3, xmm1);
+				mulps(xmm3, Xmm(4 + i));
+				cvttps2dq(xmm3, xmm3);
+				packssdw(xmm3, xmm3);
+
+				// m_env.d[i].ga = g.upl16(a);
+
+				punpcklwd(xmm2, xmm3);
+				movdqa(ptr[&m_env.d[i].ga], xmm2);
+			}
 		}
+		else
+		{
+			// GSVector4i c = GSVector4i(vertices[0].c);
 
-		// m_env.c.rb = c.xxxx();
-		// m_env.c.ga = c.zzzz();
+			movaps(xmm0, ptr[ecx]);
+			cvttps2dq(xmm0, xmm0);
 
-		movdqa(xmm1, xmm0);
-		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
-		pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
-		movdqa(xmmword[&m_env.c.rb], xmm0);
-		movdqa(xmmword[&m_env.c.ga], xmm1);
+			// c = c.upl16(c.zwxy());
+
+			pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+			punpcklwd(xmm0, xmm1);
+
+			// if(tfx == TFX_NONE) c = c.srl16(7);
+
+			if(m_sel.tfx == TFX_NONE)
+			{
+				psrlw(xmm0, 7);
+			}
+
+			// m_env.c.rb = c.xxxx();
+			// m_env.c.ga = c.zzzz();
+
+			pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			movdqa(ptr[&m_env.c.rb], xmm1);
+			movdqa(ptr[&m_env.c.ga], xmm2);
+		}
 	}
 }
 
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index 6268ecf97f..803921d886 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -84,7 +84,7 @@ GSState::GSState()
 	m_sssize += sizeof(m_tr.x);
 	m_sssize += sizeof(m_tr.y);
 	m_sssize += m_mem.m_vmsize;
-	m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * ArraySize(m_path);
+	m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * countof(m_path);
 	m_sssize += sizeof(m_q);
 
 	PRIM = &m_env.PRIM;
@@ -103,6 +103,7 @@ GSState::~GSState()
 void GSState::SetRegsMem(uint8* basemem)
 {
 	ASSERT(basemem);
+
 	m_regs = (GSPrivRegSet*)basemem;
 }
 
@@ -111,84 +112,82 @@ void GSState::SetIrqCallback(void (*irq)())
 	m_irq = irq;
 }
 
-void GSState::SetMultithreaded( bool isMT )
+void GSState::SetMultithreaded(bool mt)
 {
 	// Some older versions of PCSX2 didn't properly set the irq callback to NULL
 	// in multithreaded mode (possibly because ZeroGS itself would assert in such
 	// cases), and didn't bind them to a dummy callback either.  PCSX2 handles all
 	// IRQs internally when multithreaded anyway -- so let's ignore them here:
 
-	m_mt = isMT;
-	if( isMT )
+	m_mt = mt;
+
+	if(mt)
 	{
-		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL]	= &GSState::GIFRegHandlerNull;
-		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH]	= &GSState::GIFRegHandlerNull;
-		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL]	= &GSState::GIFRegHandlerNull;
+		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull;
+		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull;
+		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerNull;
 	}
 	else
 	{
-		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL]	= &GSState::GIFRegHandlerSIGNAL;
-		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH]	= &GSState::GIFRegHandlerFINISH;
-		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL]	= &GSState::GIFRegHandlerLABEL;
+		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerSIGNAL;
+		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerFINISH;
+		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerLABEL;
 	}
 }
 
 void GSState::SetFrameSkip(int skip)
 {
 	if(m_frameskip == skip) return;
+
 	m_frameskip = skip;
 
 	if(skip)
-	{
-		#if !UsePackedRegSwitch
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZF2]		= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZ2]		= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1]	= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2]	= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_FOG]		= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZF3]		= &GSState::GIFPackedRegHandlerNOP;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZ3]		= &GSState::GIFPackedRegHandlerNOP;
-		#endif
+	{		
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
 
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRIM]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_ST]			= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_UV]			= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3]		= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT]	= &GSState::GIFRegHandlerNOP;
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE]		= &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerNOP;
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerNOP;
 	}
 	else
 	{
-		#if !UsePackedRegSwitch
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZF2]		= &GSState::GIFPackedRegHandlerXYZF2;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZ2]		= &GSState::GIFPackedRegHandlerXYZ2;
-		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
-		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
-		m_fpGIFPackedRegHandlers[GIF_REG_FOG]		= &GSState::GIFPackedRegHandlerFOG;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZF3]		= (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
-		m_fpGIFPackedRegHandlers[GIF_REG_XYZ3]		= (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
-		#endif
-		
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRIM]		= &GSState::GIFRegHandlerPRIM;
-		m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ]		= &GSState::GIFRegHandlerRGBAQ;
-		m_fpGIFRegHandlers[GIF_A_D_REG_ST]			= &GSState::GIFRegHandlerST;
-		m_fpGIFRegHandlers[GIF_A_D_REG_UV]			= &GSState::GIFRegHandlerUV;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2]		= &GSState::GIFRegHandlerXYZF2;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2]		= &GSState::GIFRegHandlerXYZ2;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3]		= &GSState::GIFRegHandlerXYZF3;
-		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3]		= &GSState::GIFRegHandlerXYZ3;
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT]	= &GSState::GIFRegHandlerPRMODECONT;
-		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE]		= &GSState::GIFRegHandlerPRMODE;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2;
+		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
+		m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
+		m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
+		m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
+
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
+		m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
+		m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
+		m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3;
+		m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3;
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
+		m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
 	}
 }
 
 void GSState::Reset()
 {
-	memset(&m_path[0], 0, sizeof(m_path[0]) * ArraySize(m_path));
+	memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
 	memset(&m_v, 0, sizeof(m_v));
 
 //	PRIM = &m_env.PRIM;
@@ -203,88 +202,86 @@ void GSState::Reset()
 
 void GSState::ResetHandlers()
 {
-	#if !UsePackedRegSwitch
 	for(int i = 0; i < countof(m_fpGIFPackedRegHandlers); i++)
 	{
 		m_fpGIFPackedRegHandlers[i] = &GSState::GIFPackedRegHandlerNull;
 	}
 
-	m_fpGIFPackedRegHandlers[GIF_REG_PRIM]		= (GIFPackedRegHandler)&GSState::GIFRegHandlerPRIM;
-	m_fpGIFPackedRegHandlers[GIF_REG_RGBA]		= &GSState::GIFPackedRegHandlerRGBA;
-	m_fpGIFPackedRegHandlers[GIF_REG_STQ]		= &GSState::GIFPackedRegHandlerSTQ;
-	m_fpGIFPackedRegHandlers[GIF_REG_UV]		= &GSState::GIFPackedRegHandlerUV;
-	m_fpGIFPackedRegHandlers[GIF_REG_XYZF2]		= &GSState::GIFPackedRegHandlerXYZF2;
-	m_fpGIFPackedRegHandlers[GIF_REG_XYZ2]		= &GSState::GIFPackedRegHandlerXYZ2;
-	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<0>;
-	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<1>;
-	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
-	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2]	= (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
-	m_fpGIFPackedRegHandlers[GIF_REG_FOG]		= &GSState::GIFPackedRegHandlerFOG;
-	m_fpGIFPackedRegHandlers[GIF_REG_XYZF3]		= (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
-	m_fpGIFPackedRegHandlers[GIF_REG_XYZ3]		= (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
-	m_fpGIFPackedRegHandlers[GIF_REG_A_D]		= &GSState::GIFPackedRegHandlerA_D;
-	m_fpGIFPackedRegHandlers[GIF_REG_NOP]		= &GSState::GIFPackedRegHandlerNOP;
-	#endif
-
+	m_fpGIFPackedRegHandlers[GIF_REG_PRIM] = (GIFPackedRegHandler)&GSState::GIFRegHandlerPRIM;
+	m_fpGIFPackedRegHandlers[GIF_REG_RGBA] = &GSState::GIFPackedRegHandlerRGBA;
+	m_fpGIFPackedRegHandlers[GIF_REG_STQ] = &GSState::GIFPackedRegHandlerSTQ;
+	m_fpGIFPackedRegHandlers[GIF_REG_UV] = &GSState::GIFPackedRegHandlerUV;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2;
+	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<0>;
+	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<1>;
+	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
+	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
+	m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
+	m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D;
+	m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP;
+	
 	for(int i = 0; i < countof(m_fpGIFRegHandlers); i++)
 	{
 		m_fpGIFRegHandlers[i] = &GSState::GIFRegHandlerNull;
 	}
 
-	m_fpGIFRegHandlers[GIF_A_D_REG_PRIM]		= &GSState::GIFRegHandlerPRIM;
-	m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ]		= &GSState::GIFRegHandlerRGBAQ;
-	m_fpGIFRegHandlers[GIF_A_D_REG_ST]			= &GSState::GIFRegHandlerST;
-	m_fpGIFRegHandlers[GIF_A_D_REG_UV]			= &GSState::GIFRegHandlerUV;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2]		= &GSState::GIFRegHandlerXYZF2;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2]		= &GSState::GIFRegHandlerXYZ2;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1]		= &GSState::GIFRegHandlerTEX0<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2]		= &GSState::GIFRegHandlerTEX0<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1]		= &GSState::GIFRegHandlerCLAMP<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2]		= &GSState::GIFRegHandlerCLAMP<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FOG]			= &GSState::GIFRegHandlerFOG;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3]		= &GSState::GIFRegHandlerXYZF3;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3]		= &GSState::GIFRegHandlerXYZ3;
-	m_fpGIFRegHandlers[GIF_A_D_REG_NOP]			= &GSState::GIFRegHandlerNOP;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1]		= &GSState::GIFRegHandlerTEX1<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2]		= &GSState::GIFRegHandlerTEX1<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1]		= &GSState::GIFRegHandlerTEX2<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2]		= &GSState::GIFRegHandlerTEX2<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1]	= &GSState::GIFRegHandlerXYOFFSET<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2]	= &GSState::GIFRegHandlerXYOFFSET<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT]	= &GSState::GIFRegHandlerPRMODECONT;
-	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE]		= &GSState::GIFRegHandlerPRMODE;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT]		= &GSState::GIFRegHandlerTEXCLUT;
-	m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK]		= &GSState::GIFRegHandlerSCANMSK;
-	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1]	= &GSState::GIFRegHandlerMIPTBP1<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2]	= &GSState::GIFRegHandlerMIPTBP1<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1]	= &GSState::GIFRegHandlerMIPTBP2<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2]	= &GSState::GIFRegHandlerMIPTBP2<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEXA]		= &GSState::GIFRegHandlerTEXA;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL]		= &GSState::GIFRegHandlerFOGCOL;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH]	= &GSState::GIFRegHandlerTEXFLUSH;
-	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1]	= &GSState::GIFRegHandlerSCISSOR<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2]	= &GSState::GIFRegHandlerSCISSOR<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1]		= &GSState::GIFRegHandlerALPHA<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2]		= &GSState::GIFRegHandlerALPHA<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_DIMX]		= &GSState::GIFRegHandlerDIMX;
-	m_fpGIFRegHandlers[GIF_A_D_REG_DTHE]		= &GSState::GIFRegHandlerDTHE;
-	m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP]	= &GSState::GIFRegHandlerCOLCLAMP;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1]		= &GSState::GIFRegHandlerTEST<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2]		= &GSState::GIFRegHandlerTEST<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_PABE]		= &GSState::GIFRegHandlerPABE;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1]		= &GSState::GIFRegHandlerFBA<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2]		= &GSState::GIFRegHandlerFBA<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1]		= &GSState::GIFRegHandlerFRAME<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2]		= &GSState::GIFRegHandlerFRAME<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1]		= &GSState::GIFRegHandlerZBUF<0>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2]		= &GSState::GIFRegHandlerZBUF<1>;
-	m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF]	= &GSState::GIFRegHandlerBITBLTBUF;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS]		= &GSState::GIFRegHandlerTRXPOS;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG]		= &GSState::GIFRegHandlerTRXREG;
-	m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR]		= &GSState::GIFRegHandlerTRXDIR;
-	m_fpGIFRegHandlers[GIF_A_D_REG_HWREG]		= &GSState::GIFRegHandlerHWREG;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
+	m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
+	m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1] = &GSState::GIFRegHandlerTEX0<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2] = &GSState::GIFRegHandlerTEX0<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1] = &GSState::GIFRegHandlerCLAMP<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2] = &GSState::GIFRegHandlerCLAMP<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FOG] = &GSState::GIFRegHandlerFOG;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3;
+	m_fpGIFRegHandlers[GIF_A_D_REG_NOP] = &GSState::GIFRegHandlerNOP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1] = &GSState::GIFRegHandlerTEX1<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2] = &GSState::GIFRegHandlerTEX1<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1] = &GSState::GIFRegHandlerTEX2<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2] = &GSState::GIFRegHandlerTEX2<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1] = &GSState::GIFRegHandlerXYOFFSET<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2] = &GSState::GIFRegHandlerXYOFFSET<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT] = &GSState::GIFRegHandlerTEXCLUT;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK] = &GSState::GIFRegHandlerSCANMSK;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1] = &GSState::GIFRegHandlerMIPTBP1<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2] = &GSState::GIFRegHandlerMIPTBP1<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1] = &GSState::GIFRegHandlerMIPTBP2<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2] = &GSState::GIFRegHandlerMIPTBP2<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXA] = &GSState::GIFRegHandlerTEXA;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL] = &GSState::GIFRegHandlerFOGCOL;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH] = &GSState::GIFRegHandlerTEXFLUSH;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1] = &GSState::GIFRegHandlerSCISSOR<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2] = &GSState::GIFRegHandlerSCISSOR<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1] = &GSState::GIFRegHandlerALPHA<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2] = &GSState::GIFRegHandlerALPHA<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_DIMX] = &GSState::GIFRegHandlerDIMX;
+	m_fpGIFRegHandlers[GIF_A_D_REG_DTHE] = &GSState::GIFRegHandlerDTHE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP] = &GSState::GIFRegHandlerCOLCLAMP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1] = &GSState::GIFRegHandlerTEST<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2] = &GSState::GIFRegHandlerTEST<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PABE] = &GSState::GIFRegHandlerPABE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1] = &GSState::GIFRegHandlerFBA<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2] = &GSState::GIFRegHandlerFBA<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1] = &GSState::GIFRegHandlerFRAME<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2] = &GSState::GIFRegHandlerFRAME<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1] = &GSState::GIFRegHandlerZBUF<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2] = &GSState::GIFRegHandlerZBUF<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF] = &GSState::GIFRegHandlerBITBLTBUF;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS] = &GSState::GIFRegHandlerTRXPOS;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG] = &GSState::GIFRegHandlerTRXREG;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR;
+	m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG;
 
-	SetMultithreaded( m_mt );
+	SetMultithreaded(m_mt);
 }
 
 GSVector4i GSState::GetDisplayRect(int i)
@@ -375,22 +372,24 @@ int GSState::GetFPS()
 
 // GIFPackedRegHandler*
 
-void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r)
 {
 	// ASSERT(0);
 }
 
-void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
 {
 	#if _M_SSE >= 0x301
 
 	GSVector4i mask = GSVector4i::load(0x0c080400);
 	GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask);
+
 	m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v);
 
 	#elif _M_SSE >= 0x200
 
 	GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff();
+
 	m_v.RGBAQ.u32[0] = v.rgba32();
 
 	#else
@@ -405,7 +404,7 @@ void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
 	m_v.RGBAQ.Q = m_q;
 }
 
-void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
 {
 	#if defined(_M_AMD64)
 
@@ -426,7 +425,7 @@ void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
 	m_q = r->STQ.Q;
 }
 
-void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
 {
 	#if _M_SSE >= 0x200
 
@@ -441,7 +440,7 @@ void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
 	#endif
 }
 
-void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
 {
 	m_v.XYZ.X = r->XYZF2.X;
 	m_v.XYZ.Y = r->XYZF2.Y;
@@ -451,7 +450,7 @@ void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
 	VertexKick(r->XYZF2.ADC);
 }
 
-void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
 {
 	m_v.XYZ.X = r->XYZ2.X;
 	m_v.XYZ.Y = r->XYZ2.Y;
@@ -460,17 +459,17 @@ void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
 	VertexKick(r->XYZ2.ADC);
 }
 
-void __fi GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r)
 {
 	m_v.FOG.F = r->FOG.F;
 }
 
-void __fi GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r)
 {
 	(this->*m_fpGIFRegHandlers[r->A_D.ADDR])(&r->r);
 }
 
-void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r)
+__forceinline void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r)
 {
 }
 
@@ -502,6 +501,8 @@ __forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim)
 
 	m_context = &m_env.CTXT[PRIM->CTXT];
 
+	UpdateVertexKick();
+
 	ResetPrim();
 }
 
@@ -510,22 +511,22 @@ void GSState::GIFRegHandlerPRIM(const GIFReg* r)
 	ApplyPRIM(r->PRIM);
 }
 
-void GSState::GIFRegHandlerRGBAQ(const GIFReg* r)
+__forceinline void GSState::GIFRegHandlerRGBAQ(const GIFReg* r)
 {
 	m_v.RGBAQ = (GSVector4i)r->RGBAQ;
 }
 
-void GSState::GIFRegHandlerST(const GIFReg* r)
+__forceinline void GSState::GIFRegHandlerST(const GIFReg* r)
 {
 	m_v.ST = (GSVector4i)r->ST;
 }
 
-void GSState::GIFRegHandlerUV(const GIFReg* r)
+__forceinline void GSState::GIFRegHandlerUV(const GIFReg* r)
 {
 	m_v.UV.u32[0] = r->UV.u32[0] & 0x3fff3fff;
 }
 
-__fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
+void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
 {
 /*
 	m_v.XYZ.X = r->XYZF.X;
@@ -540,14 +541,14 @@ __fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
 	VertexKick(false);
 }
 
-__fi void GSState::GIFRegHandlerXYZ2(const GIFReg* r)
+void GSState::GIFRegHandlerXYZ2(const GIFReg* r)
 {
 	m_v.XYZ = (GSVector4i)r->XYZ;
 
 	VertexKick(false);
 }
 
-__fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 )
+void GSState::ApplyTEX0(uint i, GIFRegTEX0& TEX0)
 {
 	// even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing
 
@@ -578,7 +579,7 @@ __fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 )
 	}
 }
 
-template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* r)
 {
 	GIFRegTEX0 TEX0 = r->TEX0;
 
@@ -588,7 +589,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r)
 	ApplyTEX0( i, TEX0 );
 }
 
-template<int i> __fi void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
 {
 	if(PRIM->CTXT == i && r->CLAMP != m_env.CTXT[i].CLAMP)
 	{
@@ -603,7 +604,7 @@ void GSState::GIFRegHandlerFOG(const GIFReg* r)
 	m_v.FOG = (GSVector4i)r->FOG;
 }
 
-__fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
+void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
 {
 /*
 	m_v.XYZ.X = r->XYZF.X;
@@ -618,7 +619,7 @@ __fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
 	VertexKick(true);
 }
 
-__fi void GSState::GIFRegHandlerXYZ3(const GIFReg* r)
+void GSState::GIFRegHandlerXYZ3(const GIFReg* r)
 {
 	m_v.XYZ = (GSVector4i)r->XYZ;
 
@@ -629,7 +630,7 @@ void GSState::GIFRegHandlerNOP(const GIFReg* r)
 {
 }
 
-template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerTEX1(const GIFReg* r)
 {
 	if(PRIM->CTXT == i && r->TEX1 != m_env.CTXT[i].TEX1)
 	{
@@ -639,7 +640,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r)
 	m_env.CTXT[i].TEX1 = (GSVector4i)r->TEX1;
 }
 
-template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerTEX2(const GIFReg* r)
 {
 	// m_env.CTXT[i].TEX2 = r->TEX2; // not used
 
@@ -656,7 +657,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r)
 	ApplyTEX0(i, TEX0);
 }
 
-template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
 {
 	GSVector4i o = (GSVector4i)r->XYOFFSET & GSVector4i::x0000ffff();
 
@@ -670,7 +671,7 @@ template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
 	m_env.CTXT[i].UpdateScissor();
 }
 
-__fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
+void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
 {
 	if(r->PRMODECONT != m_env.PRMODECONT)
 	{
@@ -684,9 +685,11 @@ __fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
 	// if(PRIM->PRIM == 7) printf("Invalid PRMODECONT/PRIM\n");
 
 	m_context = &m_env.CTXT[PRIM->CTXT];
+
+	UpdateVertexKick();
 }
 
-__fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
+void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
 {
 	if(!m_env.PRMODECONT.AC)
 	{
@@ -698,9 +701,11 @@ __fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
 	m_env.PRMODE._PRIM = _PRIM;
 
 	m_context = &m_env.CTXT[PRIM->CTXT];
+
+	UpdateVertexKick();
 }
 
-__fi void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r)
+void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r)
 {
 	if(r->TEXCLUT != m_env.TEXCLUT)
 	{
@@ -730,7 +735,7 @@ template<int i> void GSState::GIFRegHandlerMIPTBP1(const GIFReg* r)
 	m_env.CTXT[i].MIPTBP1 = (GSVector4i)r->MIPTBP1;
 }
 
-template<int i> __fi void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r)
 {
 	if(PRIM->CTXT == i && r->MIPTBP2 != m_env.CTXT[i].MIPTBP2)
 	{
@@ -767,7 +772,7 @@ void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* r)
 	// InvalidateTextureCache();
 }
 
-template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
 {
 	if(PRIM->CTXT == i && r->SCISSOR != m_env.CTXT[i].SCISSOR)
 	{
@@ -779,7 +784,7 @@ template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
 	m_env.CTXT[i].UpdateScissor();
 }
 
-template<int i> __fi void GSState::GIFRegHandlerALPHA(const GIFReg* r)
+template<int i> void GSState::GIFRegHandlerALPHA(const GIFReg* r)
 {
 	ASSERT(r->ALPHA.A != 3);
 	ASSERT(r->ALPHA.B != 3);
@@ -1142,66 +1147,6 @@ void GSState::Read(uint8* mem, int len)
 	m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
 }
 
-// Use version 1 of the optimized local > local transfer, as per revision 887.
-// Later (more optimized?) versions cause a crash in Dark Cloud 2.
-#if 1
-void GSState::Move()
-{
-        // ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
-        // guitar hero copies the far end of the board to do a similar blend too
-
-        int sx = m_env.TRXPOS.SSAX;
-        int dx = m_env.TRXPOS.DSAX;
-        int sy = m_env.TRXPOS.SSAY;
-        int dy = m_env.TRXPOS.DSAY;
-        int w = m_env.TRXREG.RRW;
-        int h = m_env.TRXREG.RRH;
-        int xinc = 1;
-        int yinc = 1;
-
-       	InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
-		InvalidateVideoMem(m_env.BITBLTBUF, GSVector4i(dx, dy, dx + w, dy + h));
-
-        if(sx < dx) sx += w-1, dx += w-1, xinc = -1;
-        if(sy < dy) sy += h-1, dy += h-1, yinc = -1;
-
-        const GSLocalMemory::psm_t& spsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM];
-        const GSLocalMemory::psm_t& dpsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
-
-        if(m_env.BITBLTBUF.SPSM == PSM_PSMCT32 && m_env.BITBLTBUF.DPSM == PSM_PSMCT32)
-        {
-                for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
-                {
-                        DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
-                        int* soffset = spsm.rowOffset[sy & 7];
-
-                        DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
-                        int* doffset = dpsm.rowOffset[dy & 7];
-
-                        for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
-                        {
-                                m_mem.WritePixel32(dbase + doffset[dx], m_mem.ReadPixel32(sbase + soffset[sx]));
-                        }
-                }
-        }
-        else
-        {
-                for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
-                {
-                        DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
-                        int* soffset = spsm.rowOffset[sy & 7];
-
-                        DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
-                        int* doffset = dpsm.rowOffset[dy & 7];
-
-                        for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
-                        {
-                                (m_mem.*dpsm.wpa)(dbase + doffset[dx], (m_mem.*spsm.rpa)(sbase + soffset[sx]));
-                        }
-                }
-        }
-}
-#else
 void GSState::Move()
 {
 	// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
@@ -1346,10 +1291,7 @@ void GSState::Move()
 				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
 				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
 
-				for(int x = 0; x > -w; x--) {
-					printf("%d",x); //Dark Cloud 2 crashes at x = -63
-					d[dcol[x]] = s[scol[x]];
-				}
+				for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
 			}
 		}
 	}
@@ -1412,7 +1354,7 @@ void GSState::Move()
 		}
 	}
 }
-#endif
+
 void GSState::SoftReset(uint32 mask)
 {
 	if(mask & 1)
@@ -1508,91 +1450,7 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
 				{
 					do
 					{
-						uint32 reg = path.GetReg();
-
-						#if 0
-						// I assume this was some sort of debugging code?  Why intercept and perform
-						// special handling for the first three entries in the table, and then do
-						// a LUT for the rest?  Either do a switch for the whole table (best idea)
-						// or do a LUT for the whole table.
-						switch(reg)
-						{
-						case GIF_REG_RGBA:
-							GIFPackedRegHandlerRGBA((GIFPackedReg*)mem);
-							break;
-						case GIF_REG_STQ:
-							GIFPackedRegHandlerSTQ((GIFPackedReg*)mem);
-							break;
-						case GIF_REG_UV:
-							GIFPackedRegHandlerUV((GIFPackedReg*)mem);
-							break;
-						default:
-							(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
-							break;
-						}
-						#endif
-						
-						#if UsePackedRegSwitch
-						// This is a switch statement version of the LUT above.  Since there are only
-						// 16 entries, this is almost certainly ideal, since the compiler can inline
-						// all the handlers, and PGO will further optimize the switch dispatcher.
-						
-						if (FrameSkipIt)
-						{
-							// When skipping frames it looks like we only need to bother with the A_D handler
-							// and the TEX handlers.  (and I'm thinking the TEX handlers might not be necessary
-							// if the PCSX2 side of the frameskipper is smart enough anyway).
-							switch(reg)
-							{
-								case GIF_REG_A_D:		GIFPackedRegHandlerA_D	((GIFPackedReg*)mem);	break;
-								case GIF_REG_TEX0_1:	GIFRegHandlerTEX0<0>	((GIFReg*)mem);			break;
-								case GIF_REG_TEX0_2:	GIFRegHandlerTEX0<1>	((GIFReg*)mem);			break;
-
-								// Should RGBA/STQ/UV be NOPs when skipping frames?  I think so, but maybe the original
-								// switch() (above) was some hack to enable them in frameskipping mode. --air
-
-								case GIF_REG_RGBA:		//GIFPackedRegHandlerRGBA	((GIFPackedReg*)mem);	break;
-								case GIF_REG_STQ:		//GIFPackedRegHandlerSTQ	((GIFPackedReg*)mem);	break;
-								case GIF_REG_UV:		//GIFPackedRegHandlerUV	((GIFPackedReg*)mem);	break;
-
-								case GIF_REG_XYZF2:		//GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem);	break;
-								case GIF_REG_XYZ2:		//GIFPackedRegHandlerXYZ2	((GIFPackedReg*)mem);	break;
-								case GIF_REG_CLAMP_1:	//GIFRegHandlerCLAMP<0>	((GIFReg*)mem);			break;
-								case GIF_REG_CLAMP_2:	//GIFRegHandlerCLAMP<1>	((GIFReg*)mem);			break;
-								case GIF_REG_FOG:		//GIFPackedRegHandlerFOG	((GIFPackedReg*)mem);	break;
-								case GIF_REG_XYZF3:		//GIFRegHandlerXYZF3		((GIFReg*)mem);			break;
-								case GIF_REG_XYZ3:		//GIFRegHandlerXYZ3		((GIFReg*)mem);			break;
-								case GIF_REG_NOP:		break;
-							}
-						}
-						else
-						{
-							switch(reg)
-							{
-								case GIF_REG_RGBA:		GIFPackedRegHandlerRGBA	((GIFPackedReg*)mem);	break;
-								case GIF_REG_STQ:		GIFPackedRegHandlerSTQ	((GIFPackedReg*)mem);	break;
-								case GIF_REG_UV:		GIFPackedRegHandlerUV	((GIFPackedReg*)mem);	break;
-								case GIF_REG_XYZF2:		GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem);	break;
-								case GIF_REG_XYZ2:		GIFPackedRegHandlerXYZ2	((GIFPackedReg*)mem);	break;
-								case GIF_REG_TEX0_1:	GIFRegHandlerTEX0<0>	((GIFReg*)mem);			break;
-								case GIF_REG_TEX0_2:	GIFRegHandlerTEX0<1>	((GIFReg*)mem);			break;
-								case GIF_REG_CLAMP_1:	GIFRegHandlerCLAMP<0>	((GIFReg*)mem);			break;
-								case GIF_REG_CLAMP_2:	GIFRegHandlerCLAMP<1>	((GIFReg*)mem);			break;
-								case GIF_REG_FOG:		GIFPackedRegHandlerFOG	((GIFPackedReg*)mem);	break;
-								case GIF_REG_XYZF3:		GIFRegHandlerXYZF3		((GIFReg*)mem);			break;
-								case GIF_REG_XYZ3:		GIFRegHandlerXYZ3		((GIFReg*)mem);			break;
-								case GIF_REG_A_D:		GIFPackedRegHandlerA_D	((GIFPackedReg*)mem);	break;
-								case GIF_REG_NOP:		break;
-							}
-						}
-						#else
-
-						// This is the original LUT implementation of the packed reg dispatcher.
-						// Simple and clean, but the switch system below is probably more efficient.
-
-						(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
-
-						#endif
+						(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
 
 						mem += sizeof(GIFPackedReg);
 						size--;
@@ -1779,7 +1637,7 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
 	WriteState(data, &m_tr.y);
 	WriteState(data, m_mem.m_vm8, m_mem.m_vmsize);
 
-	for(int i = 0; i < ArraySize(m_path); i++)
+	for(int i = 0; i < countof(m_path); i++)
 	{
 		m_path[i].tag.NREG = m_path[i].nreg;
 		m_path[i].tag.NLOOP = m_path[i].nloop;
@@ -1874,7 +1732,7 @@ int GSState::Defrost(const GSFreezeData* fd)
 
 	m_tr.total = 0; // TODO: restore transfer state
 
-	for(int i = 0; i < ArraySize(m_path); i++)
+	for(int i = 0; i < countof(m_path); i++)
 	{
 		ReadState(&m_path[i].tag, data);
 		ReadState(&m_path[i].reg, data);
@@ -1888,6 +1746,8 @@ int GSState::Defrost(const GSFreezeData* fd)
 
 	m_context = &m_env.CTXT[PRIM->CTXT];
 
+	UpdateVertexKick();
+
 	m_env.UpdateDIMX();
 
 	for(int i = 0; i < 2; i++)
@@ -1918,7 +1778,7 @@ GSState::GSTransferBuffer::GSTransferBuffer()
 {
 	x = y = 0;
 	start = end = total = 0;
-	buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
+	buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
 }
 
 GSState::GSTransferBuffer::~GSTransferBuffer()
diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h
index a8875f02bd..56cffd5224 100644
--- a/plugins/GSdx/GSState.h
+++ b/plugins/GSdx/GSState.h
@@ -36,17 +36,11 @@
 #include "GSAlignedClass.h"
 #include "GSDump.h"
 
-// Set this to 1 to enable a switch statement instead of a LUT for the packed register handler
-// in the GifTransfer code.  Switch statement is probably faster, but it isn't fully implemented
-// yet (not properly supporting frameskipping).
-#define UsePackedRegSwitch 0
-
-class GSState : public GSAlignedClass<16>
+class GSState : public GSAlignedClass<32>
 {
-#if !UsePackedRegSwitch
 	typedef void (GSState::*GIFPackedRegHandler)(const GIFPackedReg* r);
+	
 	GIFPackedRegHandler m_fpGIFPackedRegHandlers[16];
-#endif
 
 	void GIFPackedRegHandlerNull(const GIFPackedReg* r);
 	void GIFPackedRegHandlerRGBA(const GIFPackedReg* r);
@@ -62,7 +56,7 @@ class GSState : public GSAlignedClass<16>
 
 	GIFRegHandler m_fpGIFRegHandlers[256];
 
-	void ApplyTEX0( uint i, GIFRegTEX0& TEX0 );
+	void ApplyTEX0(uint i, GIFRegTEX0& TEX0);
 	void ApplyPRIM(const GIFRegPRIM& PRIM);
 
 	void GIFRegHandlerNull(const GIFReg* r);
@@ -136,33 +130,67 @@ class GSState : public GSAlignedClass<16>
 protected:
 	bool IsBadFrame(int& skip, int UserHacks_SkipDraw);
 
-	typedef void (GSState::*DrawingKickPtr)(bool skip);
+	typedef void (GSState::*VertexKickPtr)(bool skip);
 
-	DrawingKickPtr m_dk[8];
+	VertexKickPtr m_vk[8][2][2];
+	VertexKickPtr m_vkf;
 
 	template<class T> void InitVertexKick()
 	{
-		m_dk[GS_POINTLIST]			= (DrawingKickPtr)&T::DrawingKick<GS_POINTLIST>;
-		m_dk[GS_LINELIST]			= (DrawingKickPtr)&T::DrawingKick<GS_LINELIST>;
-		m_dk[GS_LINESTRIP]			= (DrawingKickPtr)&T::DrawingKick<GS_LINESTRIP>;
-		m_dk[GS_TRIANGLELIST]		= (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLELIST>;
-		m_dk[GS_TRIANGLESTRIP]		= (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLESTRIP>;
-		m_dk[GS_TRIANGLEFAN]		= (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLEFAN>;
-		m_dk[GS_SPRITE]				= (DrawingKickPtr)&T::DrawingKick<GS_SPRITE>;
-		m_dk[GS_INVALID]			= &GSState::DrawingKickNull;
+		m_vk[GS_POINTLIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
+		m_vk[GS_POINTLIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
+		m_vk[GS_POINTLIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 0>;
+		m_vk[GS_POINTLIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 1>;
+
+		m_vk[GS_LINELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
+		m_vk[GS_LINELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
+		m_vk[GS_LINELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 0>;
+		m_vk[GS_LINELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 1>;
+
+		m_vk[GS_LINESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
+		m_vk[GS_LINESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
+		m_vk[GS_LINESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 0>;
+		m_vk[GS_LINESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 1>;
+
+		m_vk[GS_TRIANGLELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
+		m_vk[GS_TRIANGLELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
+		m_vk[GS_TRIANGLELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 0>;
+		m_vk[GS_TRIANGLELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 1>;
+
+		m_vk[GS_TRIANGLESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
+		m_vk[GS_TRIANGLESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
+		m_vk[GS_TRIANGLESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 0>;
+		m_vk[GS_TRIANGLESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 1>;
+
+		m_vk[GS_TRIANGLEFAN][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
+		m_vk[GS_TRIANGLEFAN][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
+		m_vk[GS_TRIANGLEFAN][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 0>;
+		m_vk[GS_TRIANGLEFAN][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 1>;
+
+		m_vk[GS_SPRITE][0][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
+		m_vk[GS_SPRITE][0][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
+		m_vk[GS_SPRITE][1][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 0>;
+		m_vk[GS_SPRITE][1][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 1>;
+
+		m_vk[GS_INVALID][0][0] = &GSState::VertexKickNull;
+		m_vk[GS_INVALID][0][1] = &GSState::VertexKickNull;
+		m_vk[GS_INVALID][1][0] = &GSState::VertexKickNull;
+		m_vk[GS_INVALID][1][1] = &GSState::VertexKickNull;
 	}
 
-	void DrawingKickNull(bool skip)
+	void UpdateVertexKick()
+	{
+		m_vkf = m_vk[PRIM->PRIM][PRIM->TME][PRIM->FST];
+	}
+
+	void VertexKickNull(bool skip)
 	{
 		ASSERT(0);
 	}
 
-	virtual void DoVertexKick()=0;
-
-	__fi void VertexKick(bool skip)
+	void VertexKick(bool skip)
 	{
-		DoVertexKick();
-		(this->*m_dk[PRIM->PRIM])(skip);
+		(this->*m_vkf)(skip);
 	}
 
 public:
@@ -221,6 +249,6 @@ public:
 	void SetFrameSkip(int skip);
 	void SetRegsMem(uint8* basemem);
 	void SetIrqCallback(void (*irq)());
-	void SetMultithreaded(bool isMT=true);
+	void SetMultithreaded(bool mt = true);
 };
 
diff --git a/plugins/GSdx/GSTables.h b/plugins/GSdx/GSTables.h
index ff6a7e01b4..699b5879dd 100644
--- a/plugins/GSdx/GSTables.h
+++ b/plugins/GSdx/GSTables.h
@@ -37,9 +37,12 @@ extern const uint8 clutTableT32I8[128];
 extern const uint8 clutTableT32I4[16];
 extern const uint8 clutTableT16I8[32];
 extern const uint8 clutTableT16I4[16];
-struct D3D9Blend {
+
+struct D3D9Blend
+{
 	int bogus;
 	D3DBLENDOP op;
 	D3DBLEND src, dst;
 };
+
 extern const D3D9Blend blendMapD3D9[3*3*3*3];
diff --git a/plugins/GSdx/GSTexture.cpp b/plugins/GSdx/GSTexture.cpp
index 45d0647979..53e12d49e2 100644
--- a/plugins/GSdx/GSTexture.cpp
+++ b/plugins/GSdx/GSTexture.cpp
@@ -27,6 +27,6 @@ GSTexture::GSTexture()
 	, m_size(0, 0)
 	, m_type(None)
 	, m_msaa(false)
-	, LikelyOffset (false)
+	, LikelyOffset(false)
 {
 }
diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp
index 8abbbb7217..f962bfba6b 100644
--- a/plugins/GSdx/GSTextureCache.cpp
+++ b/plugins/GSdx/GSTextureCache.cpp
@@ -836,11 +836,11 @@ GSTextureCache::Source::Source(GSRenderer* r)
 {
 	memset(m_valid, 0, sizeof(m_valid));
 
-	m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 16);
+	m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 32);
 
 	memset(m_clut, 0, sizeof(m_clut));
 
-	m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 16);
+	m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 32);
 	m_write.count = 0;
 }
 
@@ -1082,7 +1082,7 @@ void GSTextureCache::Target::Update()
 			}
 			else
 			{
-				static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 16);
+				static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 32);
 
 				int pitch = ((w + 3) & ~3) * 4;
 
diff --git a/plugins/GSdx/GSTextureCache.h b/plugins/GSdx/GSTextureCache.h
index cdd5d7c5db..8c099d86a5 100644
--- a/plugins/GSdx/GSTextureCache.h
+++ b/plugins/GSdx/GSTextureCache.h
@@ -39,7 +39,7 @@ public:
 		FMT_8,
 	};
 
-	class Surface : public GSAlignedClass<16>
+	class Surface : public GSAlignedClass<32>
 	{
 	protected:
 		GSRenderer* m_renderer;
diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp
index d4b5fd0eb6..28efd91e46 100644
--- a/plugins/GSdx/GSTextureCacheSW.cpp
+++ b/plugins/GSdx/GSTextureCacheSW.cpp
@@ -253,7 +253,7 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
 
 	if(m_buff == NULL)
 	{
-		m_buff = _aligned_malloc(tw * th * sizeof(uint32), 16);
+		m_buff = _aligned_malloc(tw * th * sizeof(uint32), 32);
 
 		if(m_buff == NULL)
 		{
diff --git a/plugins/GSdx/GSTextureFX11.cpp b/plugins/GSdx/GSTextureFX11.cpp
index e49e273682..4e99ad4168 100644
--- a/plugins/GSdx/GSTextureFX11.cpp
+++ b/plugins/GSdx/GSTextureFX11.cpp
@@ -137,6 +137,7 @@ void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
 	}
 
 	VSSetShader(i->second.vs, m_vs_cb);
+
 	IASetInputLayout(i->second.il);
 }
 
diff --git a/plugins/GSdx/GSTextureFX9.cpp b/plugins/GSdx/GSTextureFX9.cpp
index 4e03b9594c..94ad8c1a91 100644
--- a/plugins/GSdx/GSTextureFX9.cpp
+++ b/plugins/GSdx/GSTextureFX9.cpp
@@ -69,7 +69,7 @@ void GSDevice9::SetupIA(const void* vertices, int count, int prim)
 
 void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
 {
-	hash_map< uint32, GSVertexShader9 >::const_iterator i = m_vs.find(sel);
+	hash_map<uint32, GSVertexShader9>::const_iterator i = m_vs.find(sel);
 
 	if(i == m_vs.end())
 	{
@@ -110,6 +110,7 @@ void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
 	}
 
 	VSSetShader(i->second.vs, (const float*)cb, sizeof(*cb) / sizeof(GSVector4));
+
 	IASetInputLayout(i->second.il);
 }
 
diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp
index cfb62c47d1..8b76e96688 100644
--- a/plugins/GSdx/GSVector.cpp
+++ b/plugins/GSdx/GSVector.cpp
@@ -27,26 +27,6 @@ const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
 const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
 const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
 
-GSVector4i::GSVector4i(const GSVector4& v)
-{
-	m = _mm_cvttps_epi32(v);
-}
-
-GSVector4::GSVector4(const GSVector4i& v)
-{
-	m = _mm_cvtepi32_ps(v);
-}
-
-GSVector4i GSVector4i::cast(const GSVector4& v)
-{
-	return GSVector4i(_mm_castps_si128(v.m));
-}
-
-GSVector4 GSVector4::cast(const GSVector4i& v)
-{
-	return GSVector4(_mm_castsi128_ps(v.m));
-}
-
 GSVector4i GSVector4i::fit(int arx, int ary) const
 {
 	GSVector4i r = *this;
diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h
index c7b920d4b9..61237dd8d3 100644
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
@@ -64,11 +64,11 @@ public:
 		__m128i m;
 	};
 
-	GSVector4i()
+	__forceinline GSVector4i()
 	{
 	}
 
-	GSVector4i(int x, int y, int z, int w)
+	__forceinline GSVector4i(int x, int y, int z, int w)
 	{
 		// 4 gprs
 
@@ -82,86 +82,86 @@ public:
 		*this = xz.upl32(yw);
 	}
 
-	GSVector4i(int x, int y)
+	__forceinline GSVector4i(int x, int y)
 	{
 		*this = load(x).upl32(load(y));
 	}
 
-	GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7)
+	__forceinline GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7)
 	{
 		m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
 	}
 
-	GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+	__forceinline GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
 	{
 		m = _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0);
 	}
 
-	GSVector4i(const GSVector4i& v)
+	__forceinline GSVector4i(const GSVector4i& v)
 	{
 		m = v.m;
 	}
 
-	explicit GSVector4i(const GSVector2i& v)
+	__forceinline explicit GSVector4i(const GSVector2i& v)
 	{
 		m = _mm_loadl_epi64((__m128i*)&v);
 	}
 
-	explicit GSVector4i(int i)
+	__forceinline explicit GSVector4i(int i)
 	{
 		m = _mm_set1_epi32(i);
 	}
 
-	explicit GSVector4i(__m128i m)
+	__forceinline explicit GSVector4i(__m128i m)
 	{
 		this->m = m;
 	}
 
-	explicit GSVector4i(const GSVector4& v);
+	__forceinline explicit GSVector4i(const GSVector4& v);
 
-	void operator = (const GSVector4i& v)
+	__forceinline void operator = (const GSVector4i& v)
 	{
 		m = v.m;
 	}
 
-	void operator = (int i)
+	__forceinline void operator = (int i)
 	{
 		m = _mm_set1_epi32(i);
 	}
 
-	void operator = (__m128i m)
+	__forceinline void operator = (__m128i m)
 	{
 		this->m = m;
 	}
 
-	operator __m128i() const
+	__forceinline operator __m128i() const
 	{
 		return m;
 	}
 
 	// rect
 
-	int width() const
+	__forceinline int width() const
 	{
 		return right - left;
 	}
 
-	int height() const
+	__forceinline int height() const
 	{
 		return bottom - top;
 	}
 
-	GSVector4i rsize() const
+	__forceinline GSVector4i rsize() const
 	{
 		return *this - xyxy(); // same as GSVector4i(0, 0, width(), height());
 	}
 
-	bool rempty() const
+	__forceinline bool rempty() const
 	{
 		return (*this < zwzw()).mask() != 0x00ff;
 	}
 
-	GSVector4i runion(const GSVector4i& a) const
+	__forceinline GSVector4i runion(const GSVector4i& a) const
 	{
 		int i = (upl64(a) < uph64(a)).mask();
 
@@ -191,14 +191,14 @@ public:
 		return GSVector4i::zero();
 	}
 
-	GSVector4i rintersect(const GSVector4i& a) const
+	__forceinline GSVector4i rintersect(const GSVector4i& a) const
 	{
 		return sat_i32(a);
 	}
 
 	enum RoundMode {Outside, Inside, NegInf, PosInf};
 
-	template<int mode> GSVector4i ralign(const GSVector2i& a) const
+	template<int mode> __forceinline GSVector4i ralign(const GSVector2i& a) const
 	{
 		// a must be 1 << n
 
@@ -224,12 +224,12 @@ public:
 
 	#ifdef _WINDOWS
 
-	operator LPCRECT() const
+	__forceinline operator LPCRECT() const
 	{
 		return (LPCRECT)this;
 	}
 
-	operator LPRECT()
+	__forceinline operator LPRECT()
 	{
 		return (LPRECT)this;
 	}
@@ -238,7 +238,7 @@ public:
 
 	//
 
-	uint32 rgba32() const
+	__forceinline uint32 rgba32() const
 	{
 		GSVector4i v = *this;
 
@@ -252,43 +252,43 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i8(a).min_i8(b);
 	}
 
-	GSVector4i sat_i8(const GSVector4i& a) const
+	__forceinline GSVector4i sat_i8(const GSVector4i& a) const
 	{
 		return max_i8(a.xyxy()).min_i8(a.zwzw());
 	}
 
 	#endif
 
-	GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i16(a).min_i16(b);
 	}
 
-	GSVector4i sat_i16(const GSVector4i& a) const
+	__forceinline GSVector4i sat_i16(const GSVector4i& a) const
 	{
 		return max_i16(a.xyxy()).min_i16(a.zwzw());
 	}
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_i32(a).min_i32(b);
 	}
 
-	GSVector4i sat_i32(const GSVector4i& a) const
+	__forceinline GSVector4i sat_i32(const GSVector4i& a) const
 	{
 		return max_i32(a.xyxy()).min_i32(a.zwzw());
 	}
 
 	#else
 
-	GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
 	{
 		GSVector4i v;
 
@@ -300,7 +300,7 @@ public:
 		return v;
 	}
 
-	GSVector4i sat_i32(const GSVector4i& a) const
+	__forceinline GSVector4i sat_i32(const GSVector4i& a) const
 	{
 		GSVector4i v;
 
@@ -314,24 +314,24 @@ public:
 
 	#endif
 
-	GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u8(a).min_u8(b);
 	}
 
-	GSVector4i sat_u8(const GSVector4i& a) const
+	__forceinline GSVector4i sat_u8(const GSVector4i& a) const
 	{
 		return max_u8(a.xyxy()).min_u8(a.zwzw());
 	}
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u16(a).min_u16(b);
 	}
 
-	GSVector4i sat_u16(const GSVector4i& a) const
+	__forceinline GSVector4i sat_u16(const GSVector4i& a) const
 	{
 		return max_u16(a.xyxy()).min_u16(a.zwzw());
 	}
@@ -340,12 +340,12 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const
+	__forceinline GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const
 	{
 		return max_u32(a).min_u32(b);
 	}
 
-	GSVector4i sat_u32(const GSVector4i& a) const
+	__forceinline GSVector4i sat_u32(const GSVector4i& a) const
 	{
 		return max_u32(a.xyxy()).min_u32(a.zwzw());
 	}
@@ -354,87 +354,87 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i min_i8(const GSVector4i& a) const
+	__forceinline GSVector4i min_i8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi8(m, a));
 	}
 
-	GSVector4i max_i8(const GSVector4i& a) const
+	__forceinline GSVector4i max_i8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epi8(m, a));
 	}
 
 	#endif
 
-	GSVector4i min_i16(const GSVector4i& a) const
+	__forceinline GSVector4i min_i16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi16(m, a));
 	}
 
-	GSVector4i max_i16(const GSVector4i& a) const
+	__forceinline GSVector4i max_i16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epi16(m, a));
 	}
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i min_i32(const GSVector4i& a) const
+	__forceinline GSVector4i min_i32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epi32(m, a));
 	}
 
-	GSVector4i max_i32(const GSVector4i& a) const
+	__forceinline GSVector4i max_i32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epi32(m, a));
 	}
 
 	#endif
 
-	GSVector4i min_u8(const GSVector4i& a) const
+	__forceinline GSVector4i min_u8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epu8(m, a));
 	}
 
-	GSVector4i max_u8(const GSVector4i& a) const
+	__forceinline GSVector4i max_u8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epu8(m, a));
 	}
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i min_u16(const GSVector4i& a) const
+	__forceinline GSVector4i min_u16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epu16(m, a));
 	}
 
-	GSVector4i max_u16(const GSVector4i& a) const
+	__forceinline GSVector4i max_u16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epu16(m, a));
 	}
 
-	GSVector4i min_u32(const GSVector4i& a) const
+	__forceinline GSVector4i min_u32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_min_epu32(m, a));
 	}
 
-	GSVector4i max_u32(const GSVector4i& a) const
+	__forceinline GSVector4i max_u32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_max_epu32(m, a));
 	}
 
 	#endif
 
-	static int min_i16(int a, int b)
+	__forceinline static int min_i16(int a, int b)
 	{
 		 return store(load(a).min_i16(load(b)));
 	}
 
-	GSVector4i clamp8() const
+	__forceinline GSVector4i clamp8() const
 	{
 		return pu16().upl8();
 	}
 
-	GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
+	__forceinline GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
 	{
 		#if _M_SSE >= 0x401
 
@@ -449,19 +449,19 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	template<int mask> GSVector4i blend16(const GSVector4i& a) const
+	template<int mask> __forceinline GSVector4i blend16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_blend_epi16(m, a, mask));
 	}
 
 	#endif
 
-	GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
+	__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
 	{
 		return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
 	}
 
-	GSVector4i mix16(const GSVector4i& a) const
+	__forceinline GSVector4i mix16(const GSVector4i& a) const
 	{
 		#if _M_SSE >= 0x401
 
@@ -476,98 +476,98 @@ public:
 
 	#if _M_SSE >= 0x301
 
-	GSVector4i shuffle8(const GSVector4i& mask) const
+	__forceinline GSVector4i shuffle8(const GSVector4i& mask) const
 	{
 		return GSVector4i(_mm_shuffle_epi8(m, mask));
 	}
 
 	#endif
 
-	GSVector4i ps16(const GSVector4i& a) const
+	__forceinline GSVector4i ps16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packs_epi16(m, a));
 	}
 
-	GSVector4i ps16() const
+	__forceinline GSVector4i ps16() const
 	{
 		return GSVector4i(_mm_packs_epi16(m, m));
 	}
 
-	GSVector4i pu16(const GSVector4i& a) const
+	__forceinline GSVector4i pu16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packus_epi16(m, a));
 	}
 
-	GSVector4i pu16() const
+	__forceinline GSVector4i pu16() const
 	{
 		return GSVector4i(_mm_packus_epi16(m, m));
 	}
 
-	GSVector4i ps32(const GSVector4i& a) const
+	__forceinline GSVector4i ps32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packs_epi32(m, a));
 	}
 
-	GSVector4i ps32() const
+	__forceinline GSVector4i ps32() const
 	{
 		return GSVector4i(_mm_packs_epi32(m, m));
 	}
 
 	#if _M_SSE >= 0x401
 
-	GSVector4i pu32(const GSVector4i& a) const
+	__forceinline GSVector4i pu32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_packus_epi32(m, a));
 	}
 
-	GSVector4i pu32() const
+	__forceinline GSVector4i pu32() const
 	{
 		return GSVector4i(_mm_packus_epi32(m, m));
 	}
 
 	#endif
 
-	GSVector4i upl8(const GSVector4i& a) const
+	__forceinline GSVector4i upl8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpacklo_epi8(m, a));
 	}
 
-	GSVector4i uph8(const GSVector4i& a) const
+	__forceinline GSVector4i uph8(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpackhi_epi8(m, a));
 	}
 
-	GSVector4i upl16(const GSVector4i& a) const
+	__forceinline GSVector4i upl16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpacklo_epi16(m, a));
 	}
 
-	GSVector4i uph16(const GSVector4i& a) const
+	__forceinline GSVector4i uph16(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpackhi_epi16(m, a));
 	}
 
-	GSVector4i upl32(const GSVector4i& a) const
+	__forceinline GSVector4i upl32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpacklo_epi32(m, a));
 	}
 
-	GSVector4i uph32(const GSVector4i& a) const
+	__forceinline GSVector4i uph32(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpackhi_epi32(m, a));
 	}
 
-	GSVector4i upl64(const GSVector4i& a) const
+	__forceinline GSVector4i upl64(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpacklo_epi64(m, a));
 	}
 
-	GSVector4i uph64(const GSVector4i& a) const
+	__forceinline GSVector4i uph64(const GSVector4i& a) const
 	{
 		return GSVector4i(_mm_unpackhi_epi64(m, a));
 	}
 
-	GSVector4i upl8() const
+	__forceinline GSVector4i upl8() const
 	{
 		#if 0 // _M_SSE >= 0x401 // TODO: compiler bug
 
@@ -580,12 +580,12 @@ public:
 		#endif
 	}
 
-	GSVector4i uph8() const
+	__forceinline GSVector4i uph8() const
 	{
 		return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128()));
 	}
 
-	GSVector4i upl16() const
+	__forceinline GSVector4i upl16() const
 	{
 		#if 0 //_M_SSE >= 0x401 // TODO: compiler bug
 
@@ -598,12 +598,12 @@ public:
 		#endif
 	}
 
-	GSVector4i uph16() const
+	__forceinline GSVector4i uph16() const
 	{
 		return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128()));
 	}
 
-	GSVector4i upl32() const
+	__forceinline GSVector4i upl32() const
 	{
 		#if 0 //_M_SSE >= 0x401 // TODO: compiler bug
 
@@ -616,17 +616,17 @@ public:
 		#endif
 	}
 
-	GSVector4i uph32() const
+	__forceinline GSVector4i uph32() const
 	{
 		return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128()));
 	}
 
-	GSVector4i upl64() const
+	__forceinline GSVector4i upl64() const
 	{
 		return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128()));
 	}
 
-	GSVector4i uph64() const
+	__forceinline GSVector4i uph64() const
 	{
 		return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128()));
 	}
@@ -638,101 +638,101 @@ public:
 	// MSVC (2008, 2010 ctp) believes that there is a "mem, reg" form of the pmovz/sx* instructions,
 	// turning these intrinsics into a minefield, don't spill regs when using them...
 
-	GSVector4i i8to16() const
+	__forceinline GSVector4i i8to16() const
 	{
 		return GSVector4i(_mm_cvtepi8_epi16(m));
 	}
 
-	GSVector4i u8to16() const
+	__forceinline GSVector4i u8to16() const
 	{
 		return GSVector4i(_mm_cvtepu8_epi16(m));
 	}
 
-	GSVector4i i8to32() const
+	__forceinline GSVector4i i8to32() const
 	{
 		return GSVector4i(_mm_cvtepi8_epi32(m));
 	}
 
-	GSVector4i u8to32() const
+	__forceinline GSVector4i u8to32() const
 	{
 		return GSVector4i(_mm_cvtepu8_epi32(m));
 	}
 
-	GSVector4i i8to64() const
+	__forceinline GSVector4i i8to64() const
 	{
 		return GSVector4i(_mm_cvtepi8_epi64(m));
 	}
 
-	GSVector4i u8to64() const
+	__forceinline GSVector4i u8to64() const
 	{
-		return GSVector4i(_mm_cvtepu16_epi64(m));
+		return GSVector4i(_mm_cvtepu8_epi64(m)); // pmovzxbq: zero-extend the two low bytes, same result as the non-SSE4.1 upl8().upl16().upl32() path
 	}
 
-	GSVector4i i16to32() const
+	__forceinline GSVector4i i16to32() const
 	{
 		return GSVector4i(_mm_cvtepi16_epi32(m));
 	}
 
-	GSVector4i u16to32() const
+	__forceinline GSVector4i u16to32() const
 	{
 		return GSVector4i(_mm_cvtepu16_epi32(m));
 	}
 
-	GSVector4i i16to64() const
+	__forceinline GSVector4i i16to64() const
 	{
 		return GSVector4i(_mm_cvtepi16_epi64(m));
 	}
 
-	GSVector4i u16to64() const
+	__forceinline GSVector4i u16to64() const
 	{
 		return GSVector4i(_mm_cvtepu16_epi64(m));
 	}
 
-	GSVector4i i32to64() const
+	__forceinline GSVector4i i32to64() const
 	{
 		return GSVector4i(_mm_cvtepi32_epi64(m));
 	}
 
-	GSVector4i u32to64() const
+	__forceinline GSVector4i u32to64() const
 	{
 		return GSVector4i(_mm_cvtepu32_epi64(m));
 	}
 
 	#else
 
-	GSVector4i u8to16() const
+	__forceinline GSVector4i u8to16() const
 	{
 		return upl8();
 	}
 
-	GSVector4i u8to32() const
+	__forceinline GSVector4i u8to32() const
 	{
 		return upl8().upl16();
 	}
 
-	GSVector4i u8to64() const
+	__forceinline GSVector4i u8to64() const
 	{
 		return upl8().upl16().upl32();
 	}
 
-	GSVector4i u16to32() const
+	__forceinline GSVector4i u16to32() const
 	{
 		return upl16();
 	}
 
-	GSVector4i u16to64() const
+	__forceinline GSVector4i u16to64() const
 	{
 		return upl16().upl32();
 	}
 
-	GSVector4i u32to64() const
+	__forceinline GSVector4i u32to64() const
 	{
 		return upl32();
 	}
 
 	#endif
 
-	template<int i> GSVector4i srl() const
+	template<int i> __forceinline GSVector4i srl() const
 	{
 		#pragma warning(push)
 		#pragma warning(disable: 4556)
@@ -742,7 +742,7 @@ public:
 		#pragma warning(pop)
 	}
 
-	template<int i> GSVector4i srl(const GSVector4i& v)
+	template<int i> __forceinline GSVector4i srl(const GSVector4i& v)
 	{
 		#if _M_SSE >= 0x301
 
@@ -759,7 +759,7 @@ public:
 		#endif
 	}
 
-	template<int i> GSVector4i sll() const
+	template<int i> __forceinline GSVector4i sll() const
 	{
 		#pragma warning(push)
 		#pragma warning(disable: 4556)
@@ -769,172 +769,172 @@ public:
 		#pragma warning(pop)
 	}
 
-	GSVector4i sra16(int i) const
+	__forceinline GSVector4i sra16(int i) const
 	{
 		return GSVector4i(_mm_srai_epi16(m, i));
 	}
 
-	GSVector4i sra32(int i) const
+	__forceinline GSVector4i sra32(int i) const
 	{
 		return GSVector4i(_mm_srai_epi32(m, i));
 	}
 
-	GSVector4i sll16(int i) const
+	__forceinline GSVector4i sll16(int i) const
 	{
 		return GSVector4i(_mm_slli_epi16(m, i));
 	}
 
-	GSVector4i sll32(int i) const
+	__forceinline GSVector4i sll32(int i) const
 	{
 		return GSVector4i(_mm_slli_epi32(m, i));
 	}
 
-	GSVector4i sll64(int i) const
+	__forceinline GSVector4i sll64(int i) const
 	{
 		return GSVector4i(_mm_slli_epi64(m, i));
 	}
 
-	GSVector4i srl16(int i) const
+	__forceinline GSVector4i srl16(int i) const
 	{
 		return GSVector4i(_mm_srli_epi16(m, i));
 	}
 
-	GSVector4i srl32(int i) const
+	__forceinline GSVector4i srl32(int i) const
 	{
 		return GSVector4i(_mm_srli_epi32(m, i));
 	}
 
-	GSVector4i srl64(int i) const
+	__forceinline GSVector4i srl64(int i) const
 	{
 		return GSVector4i(_mm_srli_epi64(m, i));
 	}
 
-	GSVector4i add8(const GSVector4i& v) const
+	__forceinline GSVector4i add8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_add_epi8(m, v.m));
 	}
 
-	GSVector4i add16(const GSVector4i& v) const
+	__forceinline GSVector4i add16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_add_epi16(m, v.m));
 	}
 
-	GSVector4i add32(const GSVector4i& v) const
+	__forceinline GSVector4i add32(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_add_epi32(m, v.m));
 	}
 
-	GSVector4i adds8(const GSVector4i& v) const
+	__forceinline GSVector4i adds8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_adds_epi8(m, v.m));
 	}
 
-	GSVector4i adds16(const GSVector4i& v) const
+	__forceinline GSVector4i adds16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_adds_epi16(m, v.m));
 	}
 
-	GSVector4i addus8(const GSVector4i& v) const
+	__forceinline GSVector4i addus8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_adds_epu8(m, v.m));
 	}
 
-	GSVector4i addus16(const GSVector4i& v) const
+	__forceinline GSVector4i addus16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_adds_epu16(m, v.m));
 	}
 
-	GSVector4i sub8(const GSVector4i& v) const
+	__forceinline GSVector4i sub8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_sub_epi8(m, v.m));
 	}
 
-	GSVector4i sub16(const GSVector4i& v) const
+	__forceinline GSVector4i sub16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_sub_epi16(m, v.m));
 	}
 
-	GSVector4i sub32(const GSVector4i& v) const
+	__forceinline GSVector4i sub32(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_sub_epi32(m, v.m));
 	}
 
-	GSVector4i subs8(const GSVector4i& v) const
+	__forceinline GSVector4i subs8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_subs_epi8(m, v.m));
 	}
 
-	GSVector4i subs16(const GSVector4i& v) const
+	__forceinline GSVector4i subs16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_subs_epi16(m, v.m));
 	}
 
-	GSVector4i subus8(const GSVector4i& v) const
+	__forceinline GSVector4i subus8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_subs_epu8(m, v.m));
 	}
 
-	GSVector4i subus16(const GSVector4i& v) const
+	__forceinline GSVector4i subus16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_subs_epu16(m, v.m));
 	}
 
-	GSVector4i avg8(const GSVector4i& v) const
+	__forceinline GSVector4i avg8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_avg_epu8(m, v.m));
 	}
 
-	GSVector4i avg16(const GSVector4i& v) const
+	__forceinline GSVector4i avg16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_avg_epu16(m, v.m));
 	}
 
-	GSVector4i mul16hs(const GSVector4i& v) const
+	__forceinline GSVector4i mul16hs(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_mulhi_epi16(m, v.m));
 	}
 
-	GSVector4i mul16hu(const GSVector4i& v) const
+	__forceinline GSVector4i mul16hu(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_mulhi_epu16(m, v.m));
 	}
 
-	GSVector4i mul16l(const GSVector4i& v) const
+	__forceinline GSVector4i mul16l(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_mullo_epi16(m, v.m));
 	}
 
 	#if _M_SSE >= 0x301
 
-	GSVector4i mul16hrs(const GSVector4i& v) const
+	__forceinline GSVector4i mul16hrs(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_mulhrs_epi16(m, v.m));
 	}
 
 	#endif
 
-	template<int shift> GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
+	template<int shift> __forceinline GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
 	{
 		// (a - this) * f << shift + this
 
 		return add16(a.sub16(*this).modulate16<shift>(f));
 	}
 
-	template<int shift> static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
+	template<int shift> __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
 	{
 		// (a - b) * c << shift
 
 		return a.sub16(b).modulate16<shift>(c);
 	}
 
-	template<int shift> static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, const GSVector4i& d)
+	template<int shift> __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, const GSVector4i& d)
 	{
 		// (a - b) * c << shift + d
 
 		return d.add16(a.sub16(b).modulate16<shift>(c));
 	}
 
-	template<int shift> GSVector4i modulate16(const GSVector4i& f) const
+	template<int shift> __forceinline GSVector4i modulate16(const GSVector4i& f) const
 	{
 		// a * f << shift
 
@@ -950,7 +950,7 @@ public:
 		return sll16(shift + 1).mul16hs(f);
 	}
 
-	bool eq(const GSVector4i& v) const
+	__forceinline bool eq(const GSVector4i& v) const
 	{
 		#if _M_SSE >= 0x401
 		// pxor, ptest, je
@@ -962,82 +962,82 @@ public:
 		#endif
 	}
 
-	GSVector4i eq8(const GSVector4i& v) const
+	__forceinline GSVector4i eq8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpeq_epi8(m, v.m));
 	}
 
-	GSVector4i eq16(const GSVector4i& v) const
+	__forceinline GSVector4i eq16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpeq_epi16(m, v.m));
 	}
 
-	GSVector4i eq32(const GSVector4i& v) const
+	__forceinline GSVector4i eq32(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpeq_epi32(m, v.m));
 	}
 
-	GSVector4i neq8(const GSVector4i& v) const
+	__forceinline GSVector4i neq8(const GSVector4i& v) const
 	{
 		return ~eq8(v);
 	}
 
-	GSVector4i neq16(const GSVector4i& v) const
+	__forceinline GSVector4i neq16(const GSVector4i& v) const
 	{
 		return ~eq16(v);
 	}
 
-	GSVector4i neq32(const GSVector4i& v) const
+	__forceinline GSVector4i neq32(const GSVector4i& v) const
 	{
 		return ~eq32(v);
 	}
 
-	GSVector4i gt8(const GSVector4i& v) const
+	__forceinline GSVector4i gt8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpgt_epi8(m, v.m));
 	}
 
-	GSVector4i gt16(const GSVector4i& v) const
+	__forceinline GSVector4i gt16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpgt_epi16(m, v.m));
 	}
 
-	GSVector4i gt32(const GSVector4i& v) const
+	__forceinline GSVector4i gt32(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmpgt_epi32(m, v.m));
 	}
 
-	GSVector4i lt8(const GSVector4i& v) const
+	__forceinline GSVector4i lt8(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmplt_epi8(m, v.m));
 	}
 
-	GSVector4i lt16(const GSVector4i& v) const
+	__forceinline GSVector4i lt16(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmplt_epi16(m, v.m));
 	}
 
-	GSVector4i lt32(const GSVector4i& v) const
+	__forceinline GSVector4i lt32(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_cmplt_epi32(m, v.m));
 	}
 
-	GSVector4i andnot(const GSVector4i& v) const
+	__forceinline GSVector4i andnot(const GSVector4i& v) const
 	{
 		return GSVector4i(_mm_andnot_si128(v.m, m));
 	}
 
-	int mask() const
+	__forceinline int mask() const
 	{
 		return _mm_movemask_epi8(m);
 	}
 
-	bool alltrue() const
+	__forceinline bool alltrue() const
 	{
 		return _mm_movemask_epi8(m) == 0xffff;
 	}
 
-	bool allfalse() const
+	__forceinline bool allfalse() const
 	{
 		#if _M_SSE >= 0x401
 		return _mm_testz_si128(m, m) != 0;
@@ -1048,14 +1048,14 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	template<int i> GSVector4i insert8(int a) const
+	template<int i> __forceinline GSVector4i insert8(int a) const
 	{
 		return GSVector4i(_mm_insert_epi8(m, a, i));
 	}
 
 	#endif
 
-	template<int i> int extract8() const
+	template<int i> __forceinline int extract8() const
 	{
 		#if _M_SSE >= 0x401
 		return _mm_extract_epi8(m, i);
@@ -1064,26 +1064,26 @@ public:
 		#endif
 	}
 
-	template<int i> GSVector4i insert16(int a) const
+	template<int i> __forceinline GSVector4i insert16(int a) const
 	{
 		return GSVector4i(_mm_insert_epi16(m, a, i));
 	}
 
-	template<int i> int extract16() const
+	template<int i> __forceinline int extract16() const
 	{
 		return _mm_extract_epi16(m, i);
 	}
 
 	#if _M_SSE >= 0x401
 
-	template<int i> GSVector4i insert32(int a) const
+	template<int i> __forceinline GSVector4i insert32(int a) const
 	{
 		return GSVector4i(_mm_insert_epi32(m, a, i));
 	}
 
 	#endif
 
-	template<int i> int extract32() const
+	template<int i> __forceinline int extract32() const
 	{
 		if(i == 0) return GSVector4i::store(*this);
 		#if _M_SSE >= 0x401
@@ -1097,14 +1097,14 @@ public:
 
 	#if _M_SSE >= 0x401
 
-	template<int i> GSVector4i insert64(int64 a) const
+	template<int i> __forceinline GSVector4i insert64(int64 a) const
 	{
 		return GSVector4i(_mm_insert_epi64(m, a, i));
 	}
 
 	#endif
 
-	template<int i> int64 extract64() const
+	template<int i> __forceinline int64 extract64() const
 	{
 		if(i == 0) return GSVector4i::storeq(*this);
 		#if _M_SSE >= 0x401
@@ -1594,7 +1594,7 @@ public:
 
 	#endif
 
-	static GSVector4i loadnt(const void* p)
+	__forceinline static GSVector4i loadnt(const void* p)
 	{
 		#if _M_SSE >= 0x401
 
@@ -1607,27 +1607,27 @@ public:
 		#endif
 	}
 
-	static GSVector4i loadl(const void* p)
+	__forceinline static GSVector4i loadl(const void* p)
 	{
 		return GSVector4i(_mm_loadl_epi64((__m128i*)p));
 	}
 
-	static GSVector4i loadh(const void* p)
+	__forceinline static GSVector4i loadh(const void* p)
 	{
 		return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p)));
 	}
 
-	static GSVector4i loadh(const void* p, const GSVector4i& v)
+	__forceinline static GSVector4i loadh(const void* p, const GSVector4i& v)
 	{
 		return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(v.m), (__m64*)p)));
 	}
 
-	static GSVector4i load(const void* pl, const void* ph)
+	__forceinline static GSVector4i load(const void* pl, const void* ph)
 	{
 		return loadh(ph, loadl(pl));
 	}
 /*
-	static GSVector4i load(const void* pl, const void* ph)
+	__forceinline static GSVector4i load(const void* pl, const void* ph)
 	{
 		__m128i lo = _mm_loadl_epi64((__m128i*)pl);
 		__m128i hi = _mm_loadl_epi64((__m128i*)ph);
@@ -1635,67 +1635,67 @@ public:
 		return GSVector4i(_mm_unpacklo_epi64(lo, hi));
 	}
 */
-	template<bool aligned> static GSVector4i load(const void* p)
+	template<bool aligned> __forceinline static GSVector4i load(const void* p)
 	{
 		return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p));
 	}
 
-	static GSVector4i load(int i)
+	__forceinline static GSVector4i load(int i)
 	{
 		return GSVector4i(_mm_cvtsi32_si128(i));
 	}
 
 	#ifdef _M_AMD64
 
-	static GSVector4i loadq(int64 i)
+	__forceinline static GSVector4i loadq(int64 i)
 	{
 		return GSVector4i(_mm_cvtsi64_si128(i));
 	}
 
 	#endif
 
-	static void storent(void* p, const GSVector4i& v)
+	__forceinline static void storent(void* p, const GSVector4i& v)
 	{
 		_mm_stream_si128((__m128i*)p, v.m);
 	}
 
-	static void storel(void* p, const GSVector4i& v)
+	__forceinline static void storel(void* p, const GSVector4i& v)
 	{
 		_mm_storel_epi64((__m128i*)p, v.m);
 	}
 
-	static void storeh(void* p, const GSVector4i& v)
+	__forceinline static void storeh(void* p, const GSVector4i& v)
 	{
 		_mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m));
 	}
 
-	static void store(void* pl, void* ph, const GSVector4i& v)
+	__forceinline static void store(void* pl, void* ph, const GSVector4i& v)
 	{
 		GSVector4i::storel(pl, v);
 		GSVector4i::storeh(ph, v);
 	}
 
-	template<bool aligned> static void store(void* p, const GSVector4i& v)
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4i& v)
 	{
 		if(aligned) _mm_store_si128((__m128i*)p, v.m);
 		else _mm_storeu_si128((__m128i*)p, v.m);
 	}
 
-	static int store(const GSVector4i& v)
+	__forceinline static int store(const GSVector4i& v)
 	{
 		return _mm_cvtsi128_si32(v.m);
 	}
 
 	#ifdef _M_AMD64
 
-	static int64 storeq(const GSVector4i& v)
+	__forceinline static int64 storeq(const GSVector4i& v)
 	{
 		return _mm_cvtsi128_si64(v.m);
 	}
 
 	#endif
 
-	static void storent(void* RESTRICT dst, const void* RESTRICT src, size_t size)
+	__forceinline static void storent(void* RESTRICT dst, const void* RESTRICT src, size_t size)
 	{
 		const GSVector4i* s = (const GSVector4i*)src;
 		GSVector4i* d = (GSVector4i*)dst;
@@ -1877,156 +1877,156 @@ public:
 		return v.alltrue();
 	}
 
-	void operator += (const GSVector4i& v)
+	__forceinline void operator += (const GSVector4i& v)
 	{
 		m = _mm_add_epi32(m, v);
 	}
 
-	void operator -= (const GSVector4i& v)
+	__forceinline void operator -= (const GSVector4i& v)
 	{
 		m = _mm_sub_epi32(m, v);
 	}
 
-	void operator += (int i)
+	__forceinline void operator += (int i)
 	{
 		*this += GSVector4i(i);
 	}
 
-	void operator -= (int i)
+	__forceinline void operator -= (int i)
 	{
 		*this -= GSVector4i(i);
 	}
 
-	void operator <<= (const int i)
+	__forceinline void operator <<= (const int i)
 	{
 		m = _mm_slli_epi32(m, i);
 	}
 
-	void operator >>= (const int i)
+	__forceinline void operator >>= (const int i)
 	{
 		m = _mm_srli_epi32(m, i);
 	}
 
-	void operator &= (const GSVector4i& v)
+	__forceinline void operator &= (const GSVector4i& v)
 	{
 		m = _mm_and_si128(m, v);
 	}
 
-	void operator |= (const GSVector4i& v)
+	__forceinline void operator |= (const GSVector4i& v)
 	{
 		m = _mm_or_si128(m, v);
 	}
 
-	void operator ^= (const GSVector4i& v)
+	__forceinline void operator ^= (const GSVector4i& v)
 	{
 		m = _mm_xor_si128(m, v);
 	}
 
-	friend GSVector4i operator + (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator + (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_add_epi32(v1, v2));
 	}
 
-	friend GSVector4i operator - (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator - (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_sub_epi32(v1, v2));
 	}
 
-	friend GSVector4i operator + (const GSVector4i& v, int i)
+	__forceinline friend GSVector4i operator + (const GSVector4i& v, int i)
 	{
 		return v + GSVector4i(i);
 	}
 
-	friend GSVector4i operator - (const GSVector4i& v, int i)
+	__forceinline friend GSVector4i operator - (const GSVector4i& v, int i)
 	{
 		return v - GSVector4i(i);
 	}
 
-	friend GSVector4i operator << (const GSVector4i& v, const int i)
+	__forceinline friend GSVector4i operator << (const GSVector4i& v, const int i)
 	{
 		return GSVector4i(_mm_slli_epi32(v, i));
 	}
 
-	friend GSVector4i operator >> (const GSVector4i& v, const int i)
+	__forceinline friend GSVector4i operator >> (const GSVector4i& v, const int i)
 	{
 		return GSVector4i(_mm_srli_epi32(v, i));
 	}
 
-	friend GSVector4i operator & (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator & (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_and_si128(v1, v2));
 	}
 
-	friend GSVector4i operator | (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator | (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_or_si128(v1, v2));
 	}
 
-	friend GSVector4i operator ^ (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator ^ (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_xor_si128(v1, v2));
 	}
 
-	friend GSVector4i operator & (const GSVector4i& v, int i)
+	__forceinline friend GSVector4i operator & (const GSVector4i& v, int i)
 	{
 		return v & GSVector4i(i);
 	}
 
-	friend GSVector4i operator | (const GSVector4i& v, int i)
+	__forceinline friend GSVector4i operator | (const GSVector4i& v, int i)
 	{
 		return v | GSVector4i(i);
 	}
 
-	friend GSVector4i operator ^ (const GSVector4i& v, int i)
+	__forceinline friend GSVector4i operator ^ (const GSVector4i& v, int i)
 	{
 		return v ^ GSVector4i(i);
 	}
 
-	friend GSVector4i operator ~ (const GSVector4i& v)
+	__forceinline friend GSVector4i operator ~ (const GSVector4i& v)
 	{
 		return v ^ (v == v);
 	}
 
-	friend GSVector4i operator == (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator == (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_cmpeq_epi32(v1, v2));
 	}
 
-	friend GSVector4i operator != (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator != (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return ~(v1 == v2);
 	}
 
-	friend GSVector4i operator > (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator > (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_cmpgt_epi32(v1, v2));
 	}
 
-	friend GSVector4i operator < (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator < (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return GSVector4i(_mm_cmplt_epi32(v1, v2));
 	}
 
-	friend GSVector4i operator >= (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator >= (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return (v1 > v2) | (v1 == v2);
 	}
 
-	friend GSVector4i operator <= (const GSVector4i& v1, const GSVector4i& v2)
+	__forceinline friend GSVector4i operator <= (const GSVector4i& v1, const GSVector4i& v2)
 	{
 		return (v1 < v2) | (v1 == v2);
 	}
 
-	template<int i> GSVector4i shuffle() const
+	template<int i> __forceinline GSVector4i shuffle() const
 	{
 		return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(i, i, i, i)));
 	}
 
 	#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-		GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-		GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-		GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-		GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \
 
 	#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
 		VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
@@ -2041,9 +2041,9 @@ public:
 		VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
 
 	#define VECTOR4i_SHUFFLE_1(xs, xn) \
-		GSVector4i xs##4() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
-		GSVector4i xs##4l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
-		GSVector4i xs##4h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector4i xs##4() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector4i xs##4l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector4i xs##4h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
 		VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
 		VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
 		VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
@@ -2054,203 +2054,203 @@ public:
 	VECTOR4i_SHUFFLE_1(z, 2)
 	VECTOR4i_SHUFFLE_1(w, 3)
 
-	static GSVector4i zero() {return GSVector4i(_mm_setzero_si128());}
+	__forceinline static GSVector4i zero() {return GSVector4i(_mm_setzero_si128());} // vector with all 128 bits cleared
 
-	static GSVector4i xffffffff() {return zero() == zero();}
+	__forceinline static GSVector4i xffffffff() {return zero() == zero();} // compares zero() with itself; assumes operator == (defined elsewhere) yields all-ones lanes for equal elements
 
-	static GSVector4i x00000001() {return xffffffff().srl32(31);}
-	static GSVector4i x00000003() {return xffffffff().srl32(30);}
-	static GSVector4i x00000007() {return xffffffff().srl32(29);}
-	static GSVector4i x0000000f() {return xffffffff().srl32(28);}
-	static GSVector4i x0000001f() {return xffffffff().srl32(27);}
-	static GSVector4i x0000003f() {return xffffffff().srl32(26);}
-	static GSVector4i x0000007f() {return xffffffff().srl32(25);}
-	static GSVector4i x000000ff() {return xffffffff().srl32(24);}
-	static GSVector4i x000001ff() {return xffffffff().srl32(23);}
-	static GSVector4i x000003ff() {return xffffffff().srl32(22);}
-	static GSVector4i x000007ff() {return xffffffff().srl32(21);}
-	static GSVector4i x00000fff() {return xffffffff().srl32(20);}
-	static GSVector4i x00001fff() {return xffffffff().srl32(19);}
-	static GSVector4i x00003fff() {return xffffffff().srl32(18);}
-	static GSVector4i x00007fff() {return xffffffff().srl32(17);}
-	static GSVector4i x0000ffff() {return xffffffff().srl32(16);}
-	static GSVector4i x0001ffff() {return xffffffff().srl32(15);}
-	static GSVector4i x0003ffff() {return xffffffff().srl32(14);}
-	static GSVector4i x0007ffff() {return xffffffff().srl32(13);}
-	static GSVector4i x000fffff() {return xffffffff().srl32(12);}
-	static GSVector4i x001fffff() {return xffffffff().srl32(11);}
-	static GSVector4i x003fffff() {return xffffffff().srl32(10);}
-	static GSVector4i x007fffff() {return xffffffff().srl32( 9);}
-	static GSVector4i x00ffffff() {return xffffffff().srl32( 8);}
-	static GSVector4i x01ffffff() {return xffffffff().srl32( 7);}
-	static GSVector4i x03ffffff() {return xffffffff().srl32( 6);}
-	static GSVector4i x07ffffff() {return xffffffff().srl32( 5);}
-	static GSVector4i x0fffffff() {return xffffffff().srl32( 4);}
-	static GSVector4i x1fffffff() {return xffffffff().srl32( 3);}
-	static GSVector4i x3fffffff() {return xffffffff().srl32( 2);}
-	static GSVector4i x7fffffff() {return xffffffff().srl32( 1);}
+	__forceinline static GSVector4i x00000001() {return xffffffff().srl32(31);} // this run: xffffffff() shifted right per 32-bit lane by srl32 (defined elsewhere, presumably a logical shift) so that only the low bits spelled in each name remain
+	__forceinline static GSVector4i x00000003() {return xffffffff().srl32(30);}
+	__forceinline static GSVector4i x00000007() {return xffffffff().srl32(29);}
+	__forceinline static GSVector4i x0000000f() {return xffffffff().srl32(28);}
+	__forceinline static GSVector4i x0000001f() {return xffffffff().srl32(27);}
+	__forceinline static GSVector4i x0000003f() {return xffffffff().srl32(26);}
+	__forceinline static GSVector4i x0000007f() {return xffffffff().srl32(25);}
+	__forceinline static GSVector4i x000000ff() {return xffffffff().srl32(24);}
+	__forceinline static GSVector4i x000001ff() {return xffffffff().srl32(23);}
+	__forceinline static GSVector4i x000003ff() {return xffffffff().srl32(22);}
+	__forceinline static GSVector4i x000007ff() {return xffffffff().srl32(21);}
+	__forceinline static GSVector4i x00000fff() {return xffffffff().srl32(20);}
+	__forceinline static GSVector4i x00001fff() {return xffffffff().srl32(19);}
+	__forceinline static GSVector4i x00003fff() {return xffffffff().srl32(18);}
+	__forceinline static GSVector4i x00007fff() {return xffffffff().srl32(17);}
+	__forceinline static GSVector4i x0000ffff() {return xffffffff().srl32(16);}
+	__forceinline static GSVector4i x0001ffff() {return xffffffff().srl32(15);}
+	__forceinline static GSVector4i x0003ffff() {return xffffffff().srl32(14);}
+	__forceinline static GSVector4i x0007ffff() {return xffffffff().srl32(13);}
+	__forceinline static GSVector4i x000fffff() {return xffffffff().srl32(12);}
+	__forceinline static GSVector4i x001fffff() {return xffffffff().srl32(11);}
+	__forceinline static GSVector4i x003fffff() {return xffffffff().srl32(10);}
+	__forceinline static GSVector4i x007fffff() {return xffffffff().srl32( 9);}
+	__forceinline static GSVector4i x00ffffff() {return xffffffff().srl32( 8);}
+	__forceinline static GSVector4i x01ffffff() {return xffffffff().srl32( 7);}
+	__forceinline static GSVector4i x03ffffff() {return xffffffff().srl32( 6);}
+	__forceinline static GSVector4i x07ffffff() {return xffffffff().srl32( 5);}
+	__forceinline static GSVector4i x0fffffff() {return xffffffff().srl32( 4);}
+	__forceinline static GSVector4i x1fffffff() {return xffffffff().srl32( 3);}
+	__forceinline static GSVector4i x3fffffff() {return xffffffff().srl32( 2);}
+	__forceinline static GSVector4i x7fffffff() {return xffffffff().srl32( 1);}
 
-	static GSVector4i x80000000() {return xffffffff().sll32(31);}
-	static GSVector4i xc0000000() {return xffffffff().sll32(30);}
-	static GSVector4i xe0000000() {return xffffffff().sll32(29);}
-	static GSVector4i xf0000000() {return xffffffff().sll32(28);}
-	static GSVector4i xf8000000() {return xffffffff().sll32(27);}
-	static GSVector4i xfc000000() {return xffffffff().sll32(26);}
-	static GSVector4i xfe000000() {return xffffffff().sll32(25);}
-	static GSVector4i xff000000() {return xffffffff().sll32(24);}
-	static GSVector4i xff800000() {return xffffffff().sll32(23);}
-	static GSVector4i xffc00000() {return xffffffff().sll32(22);}
-	static GSVector4i xffe00000() {return xffffffff().sll32(21);}
-	static GSVector4i xfff00000() {return xffffffff().sll32(20);}
-	static GSVector4i xfff80000() {return xffffffff().sll32(19);}
-	static GSVector4i xfffc0000() {return xffffffff().sll32(18);}
-	static GSVector4i xfffe0000() {return xffffffff().sll32(17);}
-	static GSVector4i xffff0000() {return xffffffff().sll32(16);}
-	static GSVector4i xffff8000() {return xffffffff().sll32(15);}
-	static GSVector4i xffffc000() {return xffffffff().sll32(14);}
-	static GSVector4i xffffe000() {return xffffffff().sll32(13);}
-	static GSVector4i xfffff000() {return xffffffff().sll32(12);}
-	static GSVector4i xfffff800() {return xffffffff().sll32(11);}
-	static GSVector4i xfffffc00() {return xffffffff().sll32(10);}
-	static GSVector4i xfffffe00() {return xffffffff().sll32( 9);}
-	static GSVector4i xffffff00() {return xffffffff().sll32( 8);}
-	static GSVector4i xffffff80() {return xffffffff().sll32( 7);}
-	static GSVector4i xffffffc0() {return xffffffff().sll32( 6);}
-	static GSVector4i xffffffe0() {return xffffffff().sll32( 5);}
-	static GSVector4i xfffffff0() {return xffffffff().sll32( 4);}
-	static GSVector4i xfffffff8() {return xffffffff().sll32( 3);}
-	static GSVector4i xfffffffc() {return xffffffff().sll32( 2);}
-	static GSVector4i xfffffffe() {return xffffffff().sll32( 1);}
+	__forceinline static GSVector4i x80000000() {return xffffffff().sll32(31);} // this run: xffffffff() shifted left per 32-bit lane by sll32 (defined elsewhere) so that only the high bits spelled in each name remain
+	__forceinline static GSVector4i xc0000000() {return xffffffff().sll32(30);}
+	__forceinline static GSVector4i xe0000000() {return xffffffff().sll32(29);}
+	__forceinline static GSVector4i xf0000000() {return xffffffff().sll32(28);}
+	__forceinline static GSVector4i xf8000000() {return xffffffff().sll32(27);}
+	__forceinline static GSVector4i xfc000000() {return xffffffff().sll32(26);}
+	__forceinline static GSVector4i xfe000000() {return xffffffff().sll32(25);}
+	__forceinline static GSVector4i xff000000() {return xffffffff().sll32(24);}
+	__forceinline static GSVector4i xff800000() {return xffffffff().sll32(23);}
+	__forceinline static GSVector4i xffc00000() {return xffffffff().sll32(22);}
+	__forceinline static GSVector4i xffe00000() {return xffffffff().sll32(21);}
+	__forceinline static GSVector4i xfff00000() {return xffffffff().sll32(20);}
+	__forceinline static GSVector4i xfff80000() {return xffffffff().sll32(19);}
+	__forceinline static GSVector4i xfffc0000() {return xffffffff().sll32(18);}
+	__forceinline static GSVector4i xfffe0000() {return xffffffff().sll32(17);}
+	__forceinline static GSVector4i xffff0000() {return xffffffff().sll32(16);}
+	__forceinline static GSVector4i xffff8000() {return xffffffff().sll32(15);}
+	__forceinline static GSVector4i xffffc000() {return xffffffff().sll32(14);}
+	__forceinline static GSVector4i xffffe000() {return xffffffff().sll32(13);}
+	__forceinline static GSVector4i xfffff000() {return xffffffff().sll32(12);}
+	__forceinline static GSVector4i xfffff800() {return xffffffff().sll32(11);}
+	__forceinline static GSVector4i xfffffc00() {return xffffffff().sll32(10);}
+	__forceinline static GSVector4i xfffffe00() {return xffffffff().sll32( 9);}
+	__forceinline static GSVector4i xffffff00() {return xffffffff().sll32( 8);}
+	__forceinline static GSVector4i xffffff80() {return xffffffff().sll32( 7);}
+	__forceinline static GSVector4i xffffffc0() {return xffffffff().sll32( 6);}
+	__forceinline static GSVector4i xffffffe0() {return xffffffff().sll32( 5);}
+	__forceinline static GSVector4i xfffffff0() {return xffffffff().sll32( 4);}
+	__forceinline static GSVector4i xfffffff8() {return xffffffff().sll32( 3);}
+	__forceinline static GSVector4i xfffffffc() {return xffffffff().sll32( 2);}
+	__forceinline static GSVector4i xfffffffe() {return xffffffff().sll32( 1);}
 
-	static GSVector4i x0001() {return xffffffff().srl16(15);}
-	static GSVector4i x0003() {return xffffffff().srl16(14);}
-	static GSVector4i x0007() {return xffffffff().srl16(13);}
-	static GSVector4i x000f() {return xffffffff().srl16(12);}
-	static GSVector4i x001f() {return xffffffff().srl16(11);}
-	static GSVector4i x003f() {return xffffffff().srl16(10);}
-	static GSVector4i x007f() {return xffffffff().srl16( 9);}
-	static GSVector4i x00ff() {return xffffffff().srl16( 8);}
-	static GSVector4i x01ff() {return xffffffff().srl16( 7);}
-	static GSVector4i x03ff() {return xffffffff().srl16( 6);}
-	static GSVector4i x07ff() {return xffffffff().srl16( 5);}
-	static GSVector4i x0fff() {return xffffffff().srl16( 4);}
-	static GSVector4i x1fff() {return xffffffff().srl16( 3);}
-	static GSVector4i x3fff() {return xffffffff().srl16( 2);}
-	static GSVector4i x7fff() {return xffffffff().srl16( 1);}
+	__forceinline static GSVector4i x0001() {return xffffffff().srl16(15);} // this run: xffffffff() shifted right per 16-bit lane by srl16 (defined elsewhere, presumably a logical shift), keeping the low bits spelled in each name
+	__forceinline static GSVector4i x0003() {return xffffffff().srl16(14);}
+	__forceinline static GSVector4i x0007() {return xffffffff().srl16(13);}
+	__forceinline static GSVector4i x000f() {return xffffffff().srl16(12);}
+	__forceinline static GSVector4i x001f() {return xffffffff().srl16(11);}
+	__forceinline static GSVector4i x003f() {return xffffffff().srl16(10);}
+	__forceinline static GSVector4i x007f() {return xffffffff().srl16( 9);}
+	__forceinline static GSVector4i x00ff() {return xffffffff().srl16( 8);}
+	__forceinline static GSVector4i x01ff() {return xffffffff().srl16( 7);}
+	__forceinline static GSVector4i x03ff() {return xffffffff().srl16( 6);}
+	__forceinline static GSVector4i x07ff() {return xffffffff().srl16( 5);}
+	__forceinline static GSVector4i x0fff() {return xffffffff().srl16( 4);}
+	__forceinline static GSVector4i x1fff() {return xffffffff().srl16( 3);}
+	__forceinline static GSVector4i x3fff() {return xffffffff().srl16( 2);}
+	__forceinline static GSVector4i x7fff() {return xffffffff().srl16( 1);}
 
-	static GSVector4i x8000() {return xffffffff().sll16(15);}
-	static GSVector4i xc000() {return xffffffff().sll16(14);}
-	static GSVector4i xe000() {return xffffffff().sll16(13);}
-	static GSVector4i xf000() {return xffffffff().sll16(12);}
-	static GSVector4i xf800() {return xffffffff().sll16(11);}
-	static GSVector4i xfc00() {return xffffffff().sll16(10);}
-	static GSVector4i xfe00() {return xffffffff().sll16( 9);}
-	static GSVector4i xff00() {return xffffffff().sll16( 8);}
-	static GSVector4i xff80() {return xffffffff().sll16( 7);}
-	static GSVector4i xffc0() {return xffffffff().sll16( 6);}
-	static GSVector4i xffe0() {return xffffffff().sll16( 5);}
-	static GSVector4i xfff0() {return xffffffff().sll16( 4);}
-	static GSVector4i xfff8() {return xffffffff().sll16( 3);}
-	static GSVector4i xfffc() {return xffffffff().sll16( 2);}
-	static GSVector4i xfffe() {return xffffffff().sll16( 1);}
+	__forceinline static GSVector4i x8000() {return xffffffff().sll16(15);} // this run: xffffffff() shifted left per 16-bit lane by sll16 (defined elsewhere), keeping the high bits spelled in each name
+	__forceinline static GSVector4i xc000() {return xffffffff().sll16(14);}
+	__forceinline static GSVector4i xe000() {return xffffffff().sll16(13);}
+	__forceinline static GSVector4i xf000() {return xffffffff().sll16(12);}
+	__forceinline static GSVector4i xf800() {return xffffffff().sll16(11);}
+	__forceinline static GSVector4i xfc00() {return xffffffff().sll16(10);}
+	__forceinline static GSVector4i xfe00() {return xffffffff().sll16( 9);}
+	__forceinline static GSVector4i xff00() {return xffffffff().sll16( 8);}
+	__forceinline static GSVector4i xff80() {return xffffffff().sll16( 7);}
+	__forceinline static GSVector4i xffc0() {return xffffffff().sll16( 6);}
+	__forceinline static GSVector4i xffe0() {return xffffffff().sll16( 5);}
+	__forceinline static GSVector4i xfff0() {return xffffffff().sll16( 4);}
+	__forceinline static GSVector4i xfff8() {return xffffffff().sll16( 3);}
+	__forceinline static GSVector4i xfffc() {return xffffffff().sll16( 2);}
+	__forceinline static GSVector4i xfffe() {return xffffffff().sll16( 1);}
 
-	static GSVector4i xffffffff(const GSVector4i& v) {return v == v;}
+	__forceinline static GSVector4i xffffffff(const GSVector4i& v) {return v == v;} // same result as xffffffff(), but compares the caller-supplied vector with itself instead of calling zero()
 
-	static GSVector4i x00000001(const GSVector4i& v) {return xffffffff(v).srl32(31);}
-	static GSVector4i x00000003(const GSVector4i& v) {return xffffffff(v).srl32(30);}
-	static GSVector4i x00000007(const GSVector4i& v) {return xffffffff(v).srl32(29);}
-	static GSVector4i x0000000f(const GSVector4i& v) {return xffffffff(v).srl32(28);}
-	static GSVector4i x0000001f(const GSVector4i& v) {return xffffffff(v).srl32(27);}
-	static GSVector4i x0000003f(const GSVector4i& v) {return xffffffff(v).srl32(26);}
-	static GSVector4i x0000007f(const GSVector4i& v) {return xffffffff(v).srl32(25);}
-	static GSVector4i x000000ff(const GSVector4i& v) {return xffffffff(v).srl32(24);}
-	static GSVector4i x000001ff(const GSVector4i& v) {return xffffffff(v).srl32(23);}
-	static GSVector4i x000003ff(const GSVector4i& v) {return xffffffff(v).srl32(22);}
-	static GSVector4i x000007ff(const GSVector4i& v) {return xffffffff(v).srl32(21);}
-	static GSVector4i x00000fff(const GSVector4i& v) {return xffffffff(v).srl32(20);}
-	static GSVector4i x00001fff(const GSVector4i& v) {return xffffffff(v).srl32(19);}
-	static GSVector4i x00003fff(const GSVector4i& v) {return xffffffff(v).srl32(18);}
-	static GSVector4i x00007fff(const GSVector4i& v) {return xffffffff(v).srl32(17);}
-	static GSVector4i x0000ffff(const GSVector4i& v) {return xffffffff(v).srl32(16);}
-	static GSVector4i x0001ffff(const GSVector4i& v) {return xffffffff(v).srl32(15);}
-	static GSVector4i x0003ffff(const GSVector4i& v) {return xffffffff(v).srl32(14);}
-	static GSVector4i x0007ffff(const GSVector4i& v) {return xffffffff(v).srl32(13);}
-	static GSVector4i x000fffff(const GSVector4i& v) {return xffffffff(v).srl32(12);}
-	static GSVector4i x001fffff(const GSVector4i& v) {return xffffffff(v).srl32(11);}
-	static GSVector4i x003fffff(const GSVector4i& v) {return xffffffff(v).srl32(10);}
-	static GSVector4i x007fffff(const GSVector4i& v) {return xffffffff(v).srl32( 9);}
-	static GSVector4i x00ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 8);}
-	static GSVector4i x01ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 7);}
-	static GSVector4i x03ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 6);}
-	static GSVector4i x07ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 5);}
-	static GSVector4i x0fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 4);}
-	static GSVector4i x1fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 3);}
-	static GSVector4i x3fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 2);}
-	static GSVector4i x7fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 1);}
+	__forceinline static GSVector4i x00000001(const GSVector4i& v) {return xffffffff(v).srl32(31);} // this run: overloads of the 32-bit low masks that start from xffffffff(v) rather than xffffffff(); v only seeds the all-ones value
+	__forceinline static GSVector4i x00000003(const GSVector4i& v) {return xffffffff(v).srl32(30);}
+	__forceinline static GSVector4i x00000007(const GSVector4i& v) {return xffffffff(v).srl32(29);}
+	__forceinline static GSVector4i x0000000f(const GSVector4i& v) {return xffffffff(v).srl32(28);}
+	__forceinline static GSVector4i x0000001f(const GSVector4i& v) {return xffffffff(v).srl32(27);}
+	__forceinline static GSVector4i x0000003f(const GSVector4i& v) {return xffffffff(v).srl32(26);}
+	__forceinline static GSVector4i x0000007f(const GSVector4i& v) {return xffffffff(v).srl32(25);}
+	__forceinline static GSVector4i x000000ff(const GSVector4i& v) {return xffffffff(v).srl32(24);}
+	__forceinline static GSVector4i x000001ff(const GSVector4i& v) {return xffffffff(v).srl32(23);}
+	__forceinline static GSVector4i x000003ff(const GSVector4i& v) {return xffffffff(v).srl32(22);}
+	__forceinline static GSVector4i x000007ff(const GSVector4i& v) {return xffffffff(v).srl32(21);}
+	__forceinline static GSVector4i x00000fff(const GSVector4i& v) {return xffffffff(v).srl32(20);}
+	__forceinline static GSVector4i x00001fff(const GSVector4i& v) {return xffffffff(v).srl32(19);}
+	__forceinline static GSVector4i x00003fff(const GSVector4i& v) {return xffffffff(v).srl32(18);}
+	__forceinline static GSVector4i x00007fff(const GSVector4i& v) {return xffffffff(v).srl32(17);}
+	__forceinline static GSVector4i x0000ffff(const GSVector4i& v) {return xffffffff(v).srl32(16);}
+	__forceinline static GSVector4i x0001ffff(const GSVector4i& v) {return xffffffff(v).srl32(15);}
+	__forceinline static GSVector4i x0003ffff(const GSVector4i& v) {return xffffffff(v).srl32(14);}
+	__forceinline static GSVector4i x0007ffff(const GSVector4i& v) {return xffffffff(v).srl32(13);}
+	__forceinline static GSVector4i x000fffff(const GSVector4i& v) {return xffffffff(v).srl32(12);}
+	__forceinline static GSVector4i x001fffff(const GSVector4i& v) {return xffffffff(v).srl32(11);}
+	__forceinline static GSVector4i x003fffff(const GSVector4i& v) {return xffffffff(v).srl32(10);}
+	__forceinline static GSVector4i x007fffff(const GSVector4i& v) {return xffffffff(v).srl32( 9);}
+	__forceinline static GSVector4i x00ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 8);}
+	__forceinline static GSVector4i x01ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 7);}
+	__forceinline static GSVector4i x03ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 6);}
+	__forceinline static GSVector4i x07ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 5);}
+	__forceinline static GSVector4i x0fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 4);}
+	__forceinline static GSVector4i x1fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 3);}
+	__forceinline static GSVector4i x3fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 2);}
+	__forceinline static GSVector4i x7fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 1);}
 
-	static GSVector4i x80000000(const GSVector4i& v) {return xffffffff(v).sll32(31);}
-	static GSVector4i xc0000000(const GSVector4i& v) {return xffffffff(v).sll32(30);}
-	static GSVector4i xe0000000(const GSVector4i& v) {return xffffffff(v).sll32(29);}
-	static GSVector4i xf0000000(const GSVector4i& v) {return xffffffff(v).sll32(28);}
-	static GSVector4i xf8000000(const GSVector4i& v) {return xffffffff(v).sll32(27);}
-	static GSVector4i xfc000000(const GSVector4i& v) {return xffffffff(v).sll32(26);}
-	static GSVector4i xfe000000(const GSVector4i& v) {return xffffffff(v).sll32(25);}
-	static GSVector4i xff000000(const GSVector4i& v) {return xffffffff(v).sll32(24);}
-	static GSVector4i xff800000(const GSVector4i& v) {return xffffffff(v).sll32(23);}
-	static GSVector4i xffc00000(const GSVector4i& v) {return xffffffff(v).sll32(22);}
-	static GSVector4i xffe00000(const GSVector4i& v) {return xffffffff(v).sll32(21);}
-	static GSVector4i xfff00000(const GSVector4i& v) {return xffffffff(v).sll32(20);}
-	static GSVector4i xfff80000(const GSVector4i& v) {return xffffffff(v).sll32(19);}
-	static GSVector4i xfffc0000(const GSVector4i& v) {return xffffffff(v).sll32(18);}
-	static GSVector4i xfffe0000(const GSVector4i& v) {return xffffffff(v).sll32(17);}
-	static GSVector4i xffff0000(const GSVector4i& v) {return xffffffff(v).sll32(16);}
-	static GSVector4i xffff8000(const GSVector4i& v) {return xffffffff(v).sll32(15);}
-	static GSVector4i xffffc000(const GSVector4i& v) {return xffffffff(v).sll32(14);}
-	static GSVector4i xffffe000(const GSVector4i& v) {return xffffffff(v).sll32(13);}
-	static GSVector4i xfffff000(const GSVector4i& v) {return xffffffff(v).sll32(12);}
-	static GSVector4i xfffff800(const GSVector4i& v) {return xffffffff(v).sll32(11);}
-	static GSVector4i xfffffc00(const GSVector4i& v) {return xffffffff(v).sll32(10);}
-	static GSVector4i xfffffe00(const GSVector4i& v) {return xffffffff(v).sll32( 9);}
-	static GSVector4i xffffff00(const GSVector4i& v) {return xffffffff(v).sll32( 8);}
-	static GSVector4i xffffff80(const GSVector4i& v) {return xffffffff(v).sll32( 7);}
-	static GSVector4i xffffffc0(const GSVector4i& v) {return xffffffff(v).sll32( 6);}
-	static GSVector4i xffffffe0(const GSVector4i& v) {return xffffffff(v).sll32( 5);}
-	static GSVector4i xfffffff0(const GSVector4i& v) {return xffffffff(v).sll32( 4);}
-	static GSVector4i xfffffff8(const GSVector4i& v) {return xffffffff(v).sll32( 3);}
-	static GSVector4i xfffffffc(const GSVector4i& v) {return xffffffff(v).sll32( 2);}
-	static GSVector4i xfffffffe(const GSVector4i& v) {return xffffffff(v).sll32( 1);}
+	__forceinline static GSVector4i x80000000(const GSVector4i& v) {return xffffffff(v).sll32(31);} // this run: overloads of the 32-bit high masks that start from xffffffff(v) rather than xffffffff(); v only seeds the all-ones value
+	__forceinline static GSVector4i xc0000000(const GSVector4i& v) {return xffffffff(v).sll32(30);}
+	__forceinline static GSVector4i xe0000000(const GSVector4i& v) {return xffffffff(v).sll32(29);}
+	__forceinline static GSVector4i xf0000000(const GSVector4i& v) {return xffffffff(v).sll32(28);}
+	__forceinline static GSVector4i xf8000000(const GSVector4i& v) {return xffffffff(v).sll32(27);}
+	__forceinline static GSVector4i xfc000000(const GSVector4i& v) {return xffffffff(v).sll32(26);}
+	__forceinline static GSVector4i xfe000000(const GSVector4i& v) {return xffffffff(v).sll32(25);}
+	__forceinline static GSVector4i xff000000(const GSVector4i& v) {return xffffffff(v).sll32(24);}
+	__forceinline static GSVector4i xff800000(const GSVector4i& v) {return xffffffff(v).sll32(23);}
+	__forceinline static GSVector4i xffc00000(const GSVector4i& v) {return xffffffff(v).sll32(22);}
+	__forceinline static GSVector4i xffe00000(const GSVector4i& v) {return xffffffff(v).sll32(21);}
+	__forceinline static GSVector4i xfff00000(const GSVector4i& v) {return xffffffff(v).sll32(20);}
+	__forceinline static GSVector4i xfff80000(const GSVector4i& v) {return xffffffff(v).sll32(19);}
+	__forceinline static GSVector4i xfffc0000(const GSVector4i& v) {return xffffffff(v).sll32(18);}
+	__forceinline static GSVector4i xfffe0000(const GSVector4i& v) {return xffffffff(v).sll32(17);}
+	__forceinline static GSVector4i xffff0000(const GSVector4i& v) {return xffffffff(v).sll32(16);}
+	__forceinline static GSVector4i xffff8000(const GSVector4i& v) {return xffffffff(v).sll32(15);}
+	__forceinline static GSVector4i xffffc000(const GSVector4i& v) {return xffffffff(v).sll32(14);}
+	__forceinline static GSVector4i xffffe000(const GSVector4i& v) {return xffffffff(v).sll32(13);}
+	__forceinline static GSVector4i xfffff000(const GSVector4i& v) {return xffffffff(v).sll32(12);}
+	__forceinline static GSVector4i xfffff800(const GSVector4i& v) {return xffffffff(v).sll32(11);}
+	__forceinline static GSVector4i xfffffc00(const GSVector4i& v) {return xffffffff(v).sll32(10);}
+	__forceinline static GSVector4i xfffffe00(const GSVector4i& v) {return xffffffff(v).sll32( 9);}
+	__forceinline static GSVector4i xffffff00(const GSVector4i& v) {return xffffffff(v).sll32( 8);}
+	__forceinline static GSVector4i xffffff80(const GSVector4i& v) {return xffffffff(v).sll32( 7);}
+	__forceinline static GSVector4i xffffffc0(const GSVector4i& v) {return xffffffff(v).sll32( 6);}
+	__forceinline static GSVector4i xffffffe0(const GSVector4i& v) {return xffffffff(v).sll32( 5);}
+	__forceinline static GSVector4i xfffffff0(const GSVector4i& v) {return xffffffff(v).sll32( 4);}
+	__forceinline static GSVector4i xfffffff8(const GSVector4i& v) {return xffffffff(v).sll32( 3);}
+	__forceinline static GSVector4i xfffffffc(const GSVector4i& v) {return xffffffff(v).sll32( 2);}
+	__forceinline static GSVector4i xfffffffe(const GSVector4i& v) {return xffffffff(v).sll32( 1);}
 
-	static GSVector4i x0001(const GSVector4i& v) {return xffffffff(v).srl16(15);}
-	static GSVector4i x0003(const GSVector4i& v) {return xffffffff(v).srl16(14);}
-	static GSVector4i x0007(const GSVector4i& v) {return xffffffff(v).srl16(13);}
-	static GSVector4i x000f(const GSVector4i& v) {return xffffffff(v).srl16(12);}
-	static GSVector4i x001f(const GSVector4i& v) {return xffffffff(v).srl16(11);}
-	static GSVector4i x003f(const GSVector4i& v) {return xffffffff(v).srl16(10);}
-	static GSVector4i x007f(const GSVector4i& v) {return xffffffff(v).srl16( 9);}
-	static GSVector4i x00ff(const GSVector4i& v) {return xffffffff(v).srl16( 8);}
-	static GSVector4i x01ff(const GSVector4i& v) {return xffffffff(v).srl16( 7);}
-	static GSVector4i x03ff(const GSVector4i& v) {return xffffffff(v).srl16( 6);}
-	static GSVector4i x07ff(const GSVector4i& v) {return xffffffff(v).srl16( 5);}
-	static GSVector4i x0fff(const GSVector4i& v) {return xffffffff(v).srl16( 4);}
-	static GSVector4i x1fff(const GSVector4i& v) {return xffffffff(v).srl16( 3);}
-	static GSVector4i x3fff(const GSVector4i& v) {return xffffffff(v).srl16( 2);}
-	static GSVector4i x7fff(const GSVector4i& v) {return xffffffff(v).srl16( 1);}
+	__forceinline static GSVector4i x0001(const GSVector4i& v) {return xffffffff(v).srl16(15);} // this run: overloads of the 16-bit low masks that start from xffffffff(v) rather than xffffffff(); v only seeds the all-ones value
+	__forceinline static GSVector4i x0003(const GSVector4i& v) {return xffffffff(v).srl16(14);}
+	__forceinline static GSVector4i x0007(const GSVector4i& v) {return xffffffff(v).srl16(13);}
+	__forceinline static GSVector4i x000f(const GSVector4i& v) {return xffffffff(v).srl16(12);}
+	__forceinline static GSVector4i x001f(const GSVector4i& v) {return xffffffff(v).srl16(11);}
+	__forceinline static GSVector4i x003f(const GSVector4i& v) {return xffffffff(v).srl16(10);}
+	__forceinline static GSVector4i x007f(const GSVector4i& v) {return xffffffff(v).srl16( 9);}
+	__forceinline static GSVector4i x00ff(const GSVector4i& v) {return xffffffff(v).srl16( 8);}
+	__forceinline static GSVector4i x01ff(const GSVector4i& v) {return xffffffff(v).srl16( 7);}
+	__forceinline static GSVector4i x03ff(const GSVector4i& v) {return xffffffff(v).srl16( 6);}
+	__forceinline static GSVector4i x07ff(const GSVector4i& v) {return xffffffff(v).srl16( 5);}
+	__forceinline static GSVector4i x0fff(const GSVector4i& v) {return xffffffff(v).srl16( 4);}
+	__forceinline static GSVector4i x1fff(const GSVector4i& v) {return xffffffff(v).srl16( 3);}
+	__forceinline static GSVector4i x3fff(const GSVector4i& v) {return xffffffff(v).srl16( 2);}
+	__forceinline static GSVector4i x7fff(const GSVector4i& v) {return xffffffff(v).srl16( 1);}
 
-	static GSVector4i x8000(const GSVector4i& v) {return xffffffff(v).sll16(15);}
-	static GSVector4i xc000(const GSVector4i& v) {return xffffffff(v).sll16(14);}
-	static GSVector4i xe000(const GSVector4i& v) {return xffffffff(v).sll16(13);}
-	static GSVector4i xf000(const GSVector4i& v) {return xffffffff(v).sll16(12);}
-	static GSVector4i xf800(const GSVector4i& v) {return xffffffff(v).sll16(11);}
-	static GSVector4i xfc00(const GSVector4i& v) {return xffffffff(v).sll16(10);}
-	static GSVector4i xfe00(const GSVector4i& v) {return xffffffff(v).sll16( 9);}
-	static GSVector4i xff00(const GSVector4i& v) {return xffffffff(v).sll16( 8);}
-	static GSVector4i xff80(const GSVector4i& v) {return xffffffff(v).sll16( 7);}
-	static GSVector4i xffc0(const GSVector4i& v) {return xffffffff(v).sll16( 6);}
-	static GSVector4i xffe0(const GSVector4i& v) {return xffffffff(v).sll16( 5);}
-	static GSVector4i xfff0(const GSVector4i& v) {return xffffffff(v).sll16( 4);}
-	static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);}
-	static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);}
-	static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);}
+	__forceinline static GSVector4i x8000(const GSVector4i& v) {return xffffffff(v).sll16(15);} // this run: overloads of the 16-bit high masks that start from xffffffff(v) rather than xffffffff(); v only seeds the all-ones value
+	__forceinline static GSVector4i xc000(const GSVector4i& v) {return xffffffff(v).sll16(14);}
+	__forceinline static GSVector4i xe000(const GSVector4i& v) {return xffffffff(v).sll16(13);}
+	__forceinline static GSVector4i xf000(const GSVector4i& v) {return xffffffff(v).sll16(12);}
+	__forceinline static GSVector4i xf800(const GSVector4i& v) {return xffffffff(v).sll16(11);}
+	__forceinline static GSVector4i xfc00(const GSVector4i& v) {return xffffffff(v).sll16(10);}
+	__forceinline static GSVector4i xfe00(const GSVector4i& v) {return xffffffff(v).sll16( 9);}
+	__forceinline static GSVector4i xff00(const GSVector4i& v) {return xffffffff(v).sll16( 8);}
+	__forceinline static GSVector4i xff80(const GSVector4i& v) {return xffffffff(v).sll16( 7);}
+	__forceinline static GSVector4i xffc0(const GSVector4i& v) {return xffffffff(v).sll16( 6);}
+	__forceinline static GSVector4i xffe0(const GSVector4i& v) {return xffffffff(v).sll16( 5);}
+	__forceinline static GSVector4i xfff0(const GSVector4i& v) {return xffffffff(v).sll16( 4);}
+	__forceinline static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);}
+	__forceinline static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);}
+	__forceinline static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);}
 };
 
 __aligned16 class GSVector4
@@ -2280,121 +2280,121 @@ public:
 	static const GSVector4 m_x3f800000;
 	static const GSVector4 m_x4b000000;
 
-	GSVector4()
+	__forceinline GSVector4()
 	{
 	}
 
-	GSVector4(float x, float y, float z, float w)
+	__forceinline GSVector4(float x, float y, float z, float w)
 	{
 		m = _mm_set_ps(w, z, y, x);
 	}
 
-	GSVector4(float x, float y)
+	__forceinline GSVector4(float x, float y)
 	{
 		m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y));
 	}
 
-	GSVector4(int x, int y, int z, int w)
+	__forceinline GSVector4(int x, int y, int z, int w)
 	{
 		GSVector4i v(x, y, z, w);
 
 		m = _mm_cvtepi32_ps(v.m);
 	}
 
-	GSVector4(int x, int y)
+	__forceinline GSVector4(int x, int y)
 	{
 		m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
 	}
 
-	GSVector4(const GSVector4& v)
+	__forceinline GSVector4(const GSVector4& v)
 	{
 		m = v.m;
 	}
 
-	explicit GSVector4(const GSVector2& v)
+	__forceinline explicit GSVector4(const GSVector2& v)
 	{
 		m = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&v));
 	}
 
-	explicit GSVector4(const GSVector2i& v)
+	__forceinline explicit GSVector4(const GSVector2i& v)
 	{
 		m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v));
 	}
 
-	explicit GSVector4(float f)
+	__forceinline explicit GSVector4(float f)
 	{
 		m = _mm_set1_ps(f);
 	}
 
-	explicit GSVector4(__m128 m)
+	__forceinline explicit GSVector4(__m128 m)
 	{
 		this->m = m;
 	}
 
-	explicit GSVector4(uint32 u32)
+	__forceinline explicit GSVector4(uint32 u32)
 	{
 		*this = GSVector4(GSVector4i::load((int)u32).u8to32());
 	}
 
-	explicit GSVector4(const GSVector4i& v);
+	__forceinline explicit GSVector4(const GSVector4i& v);
 
-	void operator = (const GSVector4& v)
+	__forceinline void operator = (const GSVector4& v)
 	{
 		m = v.m;
 	}
 
-	void operator = (float f)
+	__forceinline void operator = (float f)
 	{
 		m = _mm_set1_ps(f);
 	}
 
-	void operator = (__m128 m)
+	__forceinline void operator = (__m128 m)
 	{
 		this->m = m;
 	}
 
-	void operator = (uint32 u32)
+	__forceinline void operator = (uint32 u32)
 	{
 		*this = GSVector4(GSVector4i::load((int)u32).u8to32());
 	}
 
-	operator __m128() const
+	__forceinline operator __m128() const
 	{
 		return m;
 	}
 
-	uint32 rgba32() const
+	__forceinline uint32 rgba32() const
 	{
 		return GSVector4i(*this).rgba32();
 	}
 
-	static GSVector4 cast(const GSVector4i& v);
+	__forceinline static GSVector4 cast(const GSVector4i& v);
 
-	GSVector4 abs() const
+	__forceinline GSVector4 abs() const
 	{
 		return *this & cast(GSVector4i::x7fffffff());
 	}
 
-	GSVector4 neg() const
+	__forceinline GSVector4 neg() const
 	{
 		return *this ^ cast(GSVector4i::x80000000());
 	}
 
-	GSVector4 rcp() const
+	__forceinline GSVector4 rcp() const
 	{
 		return GSVector4(_mm_rcp_ps(m));
 	}
 
-	GSVector4 rcpnr() const
+	__forceinline GSVector4 rcpnr() const
 	{
 		GSVector4 v = rcp();
 
 		return (v + v) - (v * v) * *this;
 	}
 
-	enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10};
+	enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11};
 
-	template<int mode> GSVector4 round() const
+	template<int mode> __forceinline GSVector4 round() const
 	{
 		#if _M_SSE >= 0x401
 
@@ -2425,57 +2425,57 @@ public:
 		#endif
 	}
 
-	GSVector4 floor() const
+	__forceinline GSVector4 floor() const
 	{
 		return round<NegInf>();
 	}
 
-	GSVector4 ceil() const
+	__forceinline GSVector4 ceil() const
 	{
 		return round<PosInf>();
 	}
 
-	GSVector4 mod2x(const GSVector4& f, const int scale = 256) const
+	__forceinline GSVector4 mod2x(const GSVector4& f, const int scale = 256) const
 	{
 		return *this * (f * (2.0f / scale));
 	}
 
-	GSVector4 mod2x(float f, const int scale = 256) const
+	__forceinline GSVector4 mod2x(float f, const int scale = 256) const
 	{
 		return mod2x(GSVector4(f), scale);
 	}
 
-	GSVector4 madd(const GSVector4& a, const GSVector4& b) const
+	__forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const
 	{
 		return *this * a + b; // TODO: _mm_fmadd_ps
 	}
 
-	GSVector4 msub(const GSVector4& a, const GSVector4& b) const
+	__forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const
 	{
-		return *this * a + b; // TODO: _mm_fmsub_ps
+		return *this * a - b; // multiply-subtract (was "+ b", identical to madd); TODO: _mm_fmsub_ps
 	}
 
-	GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
+	__forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
 	{
 		return b - *this * a; // TODO: _mm_fnmadd_ps
 	}
 
-	GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
+	__forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
 	{
 		return -b - *this * a; // TODO: _mm_fmnsub_ps
 	}
 
-	GSVector4 lerp(const GSVector4& v, const GSVector4& f) const
+	__forceinline GSVector4 lerp(const GSVector4& v, const GSVector4& f) const
 	{
 		return *this + (v - *this) * f;
 	}
 
-	GSVector4 lerp(const GSVector4& v, float f) const
+	__forceinline GSVector4 lerp(const GSVector4& v, float f) const
 	{
 		return lerp(v, GSVector4(f));
 	}
 
-	GSVector4 hadd() const
+	__forceinline GSVector4 hadd() const
 	{
 		#if _M_SSE >= 0x300
 		return GSVector4(_mm_hadd_ps(m, m));
@@ -2484,7 +2484,7 @@ public:
 		#endif
 	}
 
-	GSVector4 hadd(const GSVector4& v) const
+	__forceinline GSVector4 hadd(const GSVector4& v) const
 	{
 		#if _M_SSE >= 0x300
 		return GSVector4(_mm_hadd_ps(m, v.m));
@@ -2494,43 +2494,45 @@ public:
 	}
 
 	#if _M_SSE >= 0x401
-	template<int i> GSVector4 dp(const GSVector4& v) const
+
+	template<int i> __forceinline GSVector4 dp(const GSVector4& v) const
 	{
 		return GSVector4(_mm_dp_ps(m, v.m, i));
 	}
+
 	#endif
 
-	GSVector4 sat(const GSVector4& a, const GSVector4& b) const
+	__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
 	{
 		return GSVector4(_mm_min_ps(_mm_max_ps(m, a), b));
 	}
 
-	GSVector4 sat(const GSVector4& a) const
+	__forceinline GSVector4 sat(const GSVector4& a) const
 	{
 		return GSVector4(_mm_min_ps(_mm_max_ps(m, a.xyxy()), a.zwzw()));
 	}
 
-	GSVector4 sat(const float scale = 255) const
+	__forceinline GSVector4 sat(const float scale = 255) const
 	{
 		return sat(zero(), GSVector4(scale));
 	}
 
-	GSVector4 clamp(const float scale = 255) const
+	__forceinline GSVector4 clamp(const float scale = 255) const
 	{
 		return min(GSVector4(scale));
 	}
 
-	GSVector4 min(const GSVector4& a) const
+	__forceinline GSVector4 min(const GSVector4& a) const
 	{
 		return GSVector4(_mm_min_ps(m, a));
 	}
 
-	GSVector4 max(const GSVector4& a) const
+	__forceinline GSVector4 max(const GSVector4& a) const
 	{
 		return GSVector4(_mm_max_ps(m, a));
 	}
 
-	GSVector4 blend8(const GSVector4& a, const GSVector4& mask)  const
+	__forceinline GSVector4 blend8(const GSVector4& a, const GSVector4& mask)  const
 	{
 		#if _M_SSE >= 0x401
 
@@ -2543,42 +2545,42 @@ public:
 		#endif
 	}
 
-	GSVector4 upl(const GSVector4& a) const
+	__forceinline GSVector4 upl(const GSVector4& a) const
 	{
 		return GSVector4(_mm_unpacklo_ps(m, a));
 	}
 
-	GSVector4 uph(const GSVector4& a) const
+	__forceinline GSVector4 uph(const GSVector4& a) const
 	{
 		return GSVector4(_mm_unpackhi_ps(m, a));
 	}
 
-	GSVector4 l2h(const GSVector4& a) const
+	__forceinline GSVector4 l2h(const GSVector4& a) const
 	{
 		return GSVector4(_mm_movelh_ps(m, a));
 	}
 
-	GSVector4 h2l(const GSVector4& a) const
+	__forceinline GSVector4 h2l(const GSVector4& a) const
 	{
 		return GSVector4(_mm_movehl_ps(m, a));
 	}
 
-	GSVector4 andnot(const GSVector4& v) const
+	__forceinline GSVector4 andnot(const GSVector4& v) const
 	{
 		return GSVector4(_mm_andnot_ps(v.m, m));
 	}
 
-	int mask() const
+	__forceinline int mask() const
 	{
 		return _mm_movemask_ps(m);
 	}
 
-	bool alltrue() const
+	__forceinline bool alltrue() const
 	{
 		return _mm_movemask_ps(m) == 0xf;
 	}
 
-	bool allfalse() const
+	__forceinline bool allfalse() const
 	{
 		#if _M_SSE >= 0x401
 		__m128i a = _mm_castps_si128(m);
@@ -2590,7 +2592,7 @@ public:
 
 	// TODO: insert
 
-	template<int i> int extract() const
+	template<int i> __forceinline int extract() const
 	{
 		#if _M_SSE >= 0x401
 		return _mm_extract_ps(m, i);
@@ -2599,47 +2601,47 @@ public:
 		#endif
 	}
 
-	static GSVector4 zero()
+	__forceinline static GSVector4 zero()
 	{
 		return GSVector4(_mm_setzero_ps());
 	}
 
-	static GSVector4 xffffffff()
+	__forceinline static GSVector4 xffffffff()
 	{
 		return zero() == zero();
 	}
 
-	static GSVector4 ps0123()
+	__forceinline static GSVector4 ps0123()
 	{
 		return GSVector4(m_ps0123);
 	}
 
-	static GSVector4 ps4567()
+	__forceinline static GSVector4 ps4567()
 	{
 		return GSVector4(m_ps4567);
 	}
 
-	static GSVector4 loadl(const void* p)
+	__forceinline static GSVector4 loadl(const void* p)
 	{
 		return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p)));
 	}
 
-	static GSVector4 load(float f)
+	__forceinline static GSVector4 load(float f)
 	{
 		return GSVector4(_mm_load_ss(&f));
 	}
 
-	template<bool aligned> static GSVector4 load(const void* p)
+	template<bool aligned> __forceinline static GSVector4 load(const void* p)
 	{
-		return GSVector4i(aligned ? _mm_load_ps((__m128*)p) : _mm_loadu_ps((__m128*)p));
+		return GSVector4(aligned ? _mm_load_ps((__m128*)p) : _mm_loadu_ps((__m128*)p));
 	}
 
-	static void storel(void* p, const GSVector4& v)
+	__forceinline static void storel(void* p, const GSVector4& v)
 	{
 		_mm_store_sd((double*)p, _mm_castps_pd(v.m));
 	}
 
-	template<bool aligned> static void store(void* p, const GSVector4& v)
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4& v)
 	{
 		if(aligned) _mm_store_ps((__m128*)p, v.m);
 		else _mm_storeu_ps((__m128*)p, v.m);
@@ -2696,159 +2698,159 @@ public:
 		d = v3.h2l(v1);
 */	}
 
-	GSVector4 operator - () const
+	__forceinline GSVector4 operator - () const
 	{
 		return neg();
 	}
 
-	void operator += (const GSVector4& v)
+	__forceinline void operator += (const GSVector4& v)
 	{
 		m = _mm_add_ps(m, v);
 	}
 
-	void operator -= (const GSVector4& v)
+	__forceinline void operator -= (const GSVector4& v)
 	{
 		m = _mm_sub_ps(m, v);
 	}
 
-	void operator *= (const GSVector4& v)
+	__forceinline void operator *= (const GSVector4& v)
 	{
 		m = _mm_mul_ps(m, v);
 	}
 
-	void operator /= (const GSVector4& v)
+	__forceinline void operator /= (const GSVector4& v)
 	{
 		m = _mm_div_ps(m, v);
 	}
 
-	void operator += (float f)
+	__forceinline void operator += (float f)
 	{
 		*this += GSVector4(f);
 	}
 
-	void operator -= (float f)
+	__forceinline void operator -= (float f)
 	{
 		*this -= GSVector4(f);
 	}
 
-	void operator *= (float f)
+	__forceinline void operator *= (float f)
 	{
 		*this *= GSVector4(f);
 	}
 
-	void operator /= (float f)
+	__forceinline void operator /= (float f)
 	{
 		*this /= GSVector4(f);
 	}
 
-	void operator &= (const GSVector4& v)
+	__forceinline void operator &= (const GSVector4& v)
 	{
 		m = _mm_and_ps(m, v);
 	}
 
-	void operator |= (const GSVector4& v)
+	__forceinline void operator |= (const GSVector4& v)
 	{
 		m = _mm_or_ps(m, v);
 	}
 
-	void operator ^= (const GSVector4& v)
+	__forceinline void operator ^= (const GSVector4& v)
 	{
 		m = _mm_xor_ps(m, v);
 	}
 
-	friend GSVector4 operator + (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator + (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_add_ps(v1, v2));
 	}
 
-	friend GSVector4 operator - (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator - (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_sub_ps(v1, v2));
 	}
 
-	friend GSVector4 operator * (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator * (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_mul_ps(v1, v2));
 	}
 
-	friend GSVector4 operator / (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator / (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_div_ps(v1, v2));
 	}
 
-	friend GSVector4 operator + (const GSVector4& v, float f)
+	__forceinline friend GSVector4 operator + (const GSVector4& v, float f)
 	{
 		return v + GSVector4(f);
 	}
 
-	friend GSVector4 operator - (const GSVector4& v, float f)
+	__forceinline friend GSVector4 operator - (const GSVector4& v, float f)
 	{
 		return v - GSVector4(f);
 	}
 
-	friend GSVector4 operator * (const GSVector4& v, float f)
+	__forceinline friend GSVector4 operator * (const GSVector4& v, float f)
 	{
 		return v * GSVector4(f);
 	}
 
-	friend GSVector4 operator / (const GSVector4& v, float f)
+	__forceinline friend GSVector4 operator / (const GSVector4& v, float f)
 	{
 		return v / GSVector4(f);
 	}
 
-	friend GSVector4 operator & (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator & (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_and_ps(v1, v2));
 	}
 
-	friend GSVector4 operator | (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator | (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_or_ps(v1, v2));
 	}
 
-	friend GSVector4 operator ^ (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator ^ (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_xor_ps(v1, v2));
 	}
 
-	friend GSVector4 operator == (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator == (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmpeq_ps(v1, v2));
 	}
 
-	friend GSVector4 operator != (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator != (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmpneq_ps(v1, v2));
 	}
 
-	friend GSVector4 operator > (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator > (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmpgt_ps(v1, v2));
 	}
 
-	friend GSVector4 operator < (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator < (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmplt_ps(v1, v2));
 	}
 
-	friend GSVector4 operator >= (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator >= (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmpge_ps(v1, v2));
 	}
 
-	friend GSVector4 operator <= (const GSVector4& v1, const GSVector4& v2)
+	__forceinline friend GSVector4 operator <= (const GSVector4& v1, const GSVector4& v2)
 	{
 		return GSVector4(_mm_cmple_ps(v1, v2));
 	}
 
-	template<int i> GSVector4 shuffle() const
+	template<int i> __forceinline GSVector4 shuffle() const
 	{
 		return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i)));
 	}
 
 	#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-		GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-		GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
 
 	#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
 		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
@@ -2863,8 +2865,8 @@ public:
 		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
 
 	#define VECTOR4_SHUFFLE_1(xs, xn) \
-		GSVector4 xs##4() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
-		GSVector4 xs##4(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector4 xs##4() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector4 xs##4(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
 		VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
 		VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
 		VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
@@ -2876,4 +2878,541 @@ public:
 	VECTOR4_SHUFFLE_1(w, 3)
 };
 
+__forceinline GSVector4i::GSVector4i(const GSVector4& v)
+{
+	m = _mm_cvttps_epi32(v);
+}
+
+__forceinline GSVector4::GSVector4(const GSVector4i& v)
+{
+	m = _mm_cvtepi32_ps(v);
+}
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+	return GSVector4i(_mm_castps_si128(v.m));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+	return GSVector4(_mm_castsi128_ps(v.m));
+}
+
+#if _M_SSE >= 0x500
+
+class GSVector8;
+
+__aligned32 class GSVector8i
+{
+public:
+	union
+	{
+		struct {int x0, y0, z0, w0, x1, y1, z1, w1;};
+		struct {int r0, g0, b0, a0, r1, g1, b1, a1;};
+		int v[8];
+		float f32[8];
+		int8 i8[32];
+		int16 i16[16];
+		int32 i32[8];
+		int64 i64[4];
+		uint8 u8[32];
+		uint16 u16[16];
+		uint32 u32[8];
+		uint64 u64[4];
+		__m256i m;
+	};
+
+	__forceinline GSVector8i()
+	{
+	}
+
+	__forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		m = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0); // arguments run highest element first: x1..w1 fill the upper 128 bits
+	}
+
+	__forceinline GSVector8i(__m128i m0, __m128i m1)
+	{
+		m = _mm256_insertf128_si256(_mm256_insertf128_si256(zero(), m0, 0), m1, 1);
+	}
+
+	__forceinline GSVector8i(const GSVector8i& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline explicit GSVector8i(int i)
+	{
+		m = _mm256_set1_epi32(i);
+	}
+
+	__forceinline explicit GSVector8i(__m128i m)
+	{
+		this->m = _mm256_insertf128_si256(_mm256_insertf128_si256(zero(), m, 0), m, 1);
+	}
+
+	__forceinline explicit GSVector8i(__m256i m)
+	{
+		this->m = m;
+	}
+
+	__forceinline explicit GSVector8i(const GSVector8& v);
+
+	__forceinline void operator = (const GSVector8i& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline void operator = (int i)
+	{
+		m = _mm256_set1_epi32(i);
+	}
+
+	__forceinline void operator = (__m128i m)
+	{
+		this->m = _mm256_insertf128_si256(_mm256_insertf128_si256(zero(), m, 0), m, 1);
+	}
+
+	__forceinline void operator = (__m256i m)
+	{
+		this->m = m;
+	}
+
+	__forceinline operator __m256i() const
+	{
+		return m;
+	}
+
+	__forceinline static GSVector8i cast(const GSVector8& v); // bit-cast, defined after GSVector8; inline specifier matches that definition
+
+	// TODO
+
+	__forceinline static GSVector8i zero()
+	{
+		return GSVector8i(_mm256_setzero_si256());
+	}
+
+	// TODO
+
+	template<bool aligned> __forceinline static GSVector8i load(const void* p)
+	{
+		return GSVector8i(aligned ? _mm256_load_si256((const __m256i*)p) : _mm256_loadu_si256((const __m256i*)p)); // aligned path requires a 32-byte aligned p
+	}
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8i& v)
+	{
+		if(aligned) _mm256_store_si256((__m256i*)p, v.m); // aligned path requires a 32-byte aligned p
+		else _mm256_storeu_si256((__m256i*)p, v.m);
+	}
+};
+
+__aligned32 class GSVector8
+{
+public:
+	union
+	{
+		struct {float x0, y0, z0, w0, x1, y1, z1, w1;};
+		struct {float r0, g0, b0, a0, r1, g1, b1, a1;};
+		float v[8];
+		float f32[8];
+		int8 i8[32];
+		int16 i16[16];
+		int32 i32[8];
+		int64 i64[4];
+		uint8 u8[32];
+		uint16 u16[16];
+		uint32 u32[8];
+		uint64 u64[4];
+		__m256 m;
+	};
+
+	__forceinline GSVector8()
+	{
+	}
+
+	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
+	{
+		m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0); // arguments run highest element first: x1..w1 fill the upper 128 bits
+	}
+
+	__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		GSVector8i v(x0, y0, z0, w0, x1, y1, z1, w1);
+
+		m = _mm256_cvtepi32_ps(v);
+	}
+
+	__forceinline GSVector8(__m128 m0, __m128 m1)
+	{
+		m = _mm256_insertf128_ps(_mm256_insertf128_ps(zero(), m0, 0), m1, 1);
+	}
+
+	__forceinline GSVector8(const GSVector8& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline explicit GSVector8(float f)
+	{
+		m = _mm256_set1_ps(f); // _mm256_broadcast_ss(&f); ?
+	}
+
+	__forceinline explicit GSVector8(__m128 m)
+	{
+		this->m = _mm256_insertf128_ps(_mm256_insertf128_ps(zero(), m, 0), m, 1);
+	}
+
+	__forceinline explicit GSVector8(__m256 m)
+	{
+		this->m = m;
+	}
+
+	__forceinline explicit GSVector8(const GSVector8i& v);
+
+	__forceinline void operator = (const GSVector8& v)
+	{
+		m = v.m;
+	}
+
+	__forceinline void operator = (float f)
+	{
+		m = _mm256_set1_ps(f);
+	}
+
+	__forceinline void operator = (__m128 m)
+	{
+		this->m = _mm256_insertf128_ps(_mm256_insertf128_ps(zero(), m, 0), m, 1);
+	}
+
+	__forceinline void operator = (__m256 m)
+	{
+		this->m = m;
+	}
+
+	__forceinline operator __m256() const
+	{
+		return m;
+	}
+
+	__forceinline static GSVector8 cast(const GSVector8i& v);
+
+	__forceinline GSVector8 abs() const
+	{
+		return *this & cast(GSVector8i(GSVector4i::x7fffffff()));
+	}
+
+	__forceinline GSVector8 neg() const
+	{
+		return *this ^ cast(GSVector8i(GSVector4i::x80000000()));
+	}
+
+	__forceinline GSVector8 rcp() const
+	{
+		return GSVector8(_mm256_rcp_ps(m));
+	}
+
+	__forceinline GSVector8 rcpnr() const
+	{
+		GSVector8 v = rcp();
+
+		return (v + v) - (v * v) * *this;
+	}
+
+	enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11};
+
+	template<int mode> __forceinline GSVector8 round() const
+	{
+		return GSVector8(_mm256_round_ps(m, mode));
+	}
+
+	__forceinline GSVector8 floor() const
+	{
+		return round<NegInf>();
+	}
+
+	__forceinline GSVector8 ceil() const
+	{
+		return round<PosInf>();
+	}
+
+	// TODO
+
+	__forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask)  const
+	{
+		return GSVector8(_mm256_blendv_ps(m, a, mask));
+	}
+
+	__forceinline GSVector8 upl32(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_unpacklo_ps(m, a));
+	}
+
+	__forceinline GSVector8 uph32(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_unpackhi_ps(m, a));
+	}
+
+	__forceinline GSVector8 upl64(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(m), _mm256_castps_pd(a))));
+	}
+
+	__forceinline GSVector8 uph64(const GSVector8& a) const
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(m), _mm256_castps_pd(a))));
+	}
+
+	// TODO
+
+	__forceinline GSVector8 andnot(const GSVector8& v) const
+	{
+		return GSVector8(_mm256_andnot_ps(v.m, m));
+	}
+
+	__forceinline int mask() const
+	{
+		return _mm256_movemask_ps(m);
+	}
+
+	__forceinline bool alltrue() const
+	{
+		return _mm256_movemask_ps(m) == 0xff;
+	}
+
+	__forceinline bool allfalse() const
+	{
+		return _mm256_movemask_ps(m) == 0;
+	}
+
+	template<int i> __forceinline GSVector4 extract() const
+	{
+		return GSVector4(_mm256_extractf128_ps(m, i));
+	}
+
+	// TODO: insert
+
+	__forceinline static GSVector8 zero()
+	{
+		return GSVector8(_mm256_setzero_ps());
+	}
+
+	__forceinline static GSVector8 xffffffff()
+	{
+		return zero() == zero();
+	}
+
+	// TODO: load low, ss
+
+	template<bool aligned> __forceinline static GSVector8 load(const void* p)
+	{
+		return GSVector8(aligned ? _mm256_load_ps((const float*)p) : _mm256_loadu_ps((const float*)p));
+	}
+
+	// TODO: store low, ss
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
+	{
+		if(aligned) _mm256_store_ps((float*)p, v.m);
+		else _mm256_storeu_ps((float*)p, v.m);
+	}
+
+	// TODO
+
+	__forceinline GSVector8 operator - () const
+	{
+		return neg();
+	}
+
+	__forceinline void operator += (const GSVector8& v)
+	{
+		m = _mm256_add_ps(m, v);
+	}
+
+	__forceinline void operator -= (const GSVector8& v)
+	{
+		m = _mm256_sub_ps(m, v);
+	}
+
+	__forceinline void operator *= (const GSVector8& v)
+	{
+		m = _mm256_mul_ps(m, v);
+	}
+
+	__forceinline void operator /= (const GSVector8& v)
+	{
+		m = _mm256_div_ps(m, v);
+	}
+
+	__forceinline void operator += (float f)
+	{
+		*this += GSVector8(f);
+	}
+
+	__forceinline void operator -= (float f)
+	{
+		*this -= GSVector8(f);
+	}
+
+	__forceinline void operator *= (float f)
+	{
+		*this *= GSVector8(f);
+	}
+
+	__forceinline void operator /= (float f)
+	{
+		*this /= GSVector8(f);
+	}
+
+	__forceinline void operator &= (const GSVector8& v)
+	{
+		m = _mm256_and_ps(m, v);
+	}
+
+	__forceinline void operator |= (const GSVector8& v)
+	{
+		m = _mm256_or_ps(m, v);
+	}
+
+	__forceinline void operator ^= (const GSVector8& v)
+	{
+		m = _mm256_xor_ps(m, v);
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_add_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_sub_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_mul_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_div_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v, float f)
+	{
+		return v + GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v, float f)
+	{
+		return v - GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v, float f)
+	{
+		return v * GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v, float f)
+	{
+		return v / GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_and_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_or_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_xor_ps(v1, v2));
+	}
+
+	__forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_EQ_OQ));
+	}
+
+	__forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_NEQ_UQ)); // unordered predicate, as _mm_cmpneq_ps in GSVector4: a NaN operand compares "not equal"
+	}
+
+	__forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_GT_OQ));
+	}
+
+	__forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LT_OQ));
+	}
+
+	__forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_GE_OQ));
+	}
+
+	__forceinline friend GSVector8 operator <= (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_LE_OQ));
+	}
+
+	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	#define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_1(xs, xn) \
+		__forceinline GSVector8 xs##4() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		__forceinline GSVector8 xs##4(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+		VECTOR8_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR8_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR8_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR8_SHUFFLE_2(xs, xn, w, 3) \
+
+	VECTOR8_SHUFFLE_1(x, 0)
+	VECTOR8_SHUFFLE_1(y, 1)
+	VECTOR8_SHUFFLE_1(z, 2)
+	VECTOR8_SHUFFLE_1(w, 3)
+};
+
+__forceinline GSVector8i::GSVector8i(const GSVector8& v)
+{
+	m = _mm256_cvttps_epi32(v);
+}
+
+__forceinline GSVector8::GSVector8(const GSVector8i& v)
+{
+	m = _mm256_cvtepi32_ps(v);
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector8& v)
+{
+	return GSVector8i(_mm256_castps_si256(v.m));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector8i& v)
+{
+	return GSVector8(_mm256_castsi256_ps(v.m));
+}
+
+#endif
+
 #pragma pack(pop)
diff --git a/plugins/GSdx/GSVertex.h b/plugins/GSdx/GSVertex.h
index 99a48b37d1..822564b4c1 100644
--- a/plugins/GSdx/GSVertex.h
+++ b/plugins/GSdx/GSVertex.h
@@ -28,7 +28,7 @@
 
 #pragma pack(push, 1)
 
-__aligned16 struct GSVertex
+__aligned32 struct GSVertex
 {
 	union
 	{
diff --git a/plugins/GSdx/GSVertexHW.h b/plugins/GSdx/GSVertexHW.h
index 4b640ecf9c..a5872016a9 100644
--- a/plugins/GSdx/GSVertexHW.h
+++ b/plugins/GSdx/GSVertexHW.h
@@ -26,7 +26,7 @@
 
 #pragma pack(push, 1)
 
-__aligned16 union GSVertexHW9
+__aligned32 union GSVertexHW9
 {
 	struct
 	{
@@ -56,7 +56,7 @@ __aligned16 union GSVertexHW9
 	float GetQ() {return p.w;}
 };
 
-__aligned16 union GSVertexHW11
+__aligned32 union GSVertexHW11
 {
 	struct
 	{
diff --git a/plugins/GSdx/GSVertexList.h b/plugins/GSdx/GSVertexList.h
index daf60e6a57..adc6efda1d 100644
--- a/plugins/GSdx/GSVertexList.h
+++ b/plugins/GSdx/GSVertexList.h
@@ -31,7 +31,7 @@ public:
 	GSVertexList()
 		: m_count(0)
 	{
-		m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 16);
+		m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 32);
 
 		for(int i = 0; i < countof(m_v); i++)
 		{
diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h
index 722efb5dd1..b4022db189 100644
--- a/plugins/GSdx/GSVertexSW.h
+++ b/plugins/GSdx/GSVertexSW.h
@@ -23,12 +23,16 @@
 
 #include "GSVector.h"
 
-__aligned16 union GSVertexSW
+__aligned32 union GSVertexSW
 {
 	struct {GSVector4 c, p, t;};
 	struct {GSVector4 v[3];};
 	struct {float f[12];};
 
+	#if _M_SSE >= 0x500
+	struct {GSVector8 cp, t_;};
+	#endif
+
 	GSVertexSW() {}
 	GSVertexSW(const GSVertexSW& v) {*this = v;}
 
@@ -213,4 +217,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, float f)
 	v0.t = v.t / vf;
 	return v0;
 }
-
diff --git a/plugins/GSdx/GSVertexTrace.cpp b/plugins/GSdx/GSVertexTrace.cpp
index 30bb54d229..72f2d0d45e 100644
--- a/plugins/GSdx/GSVertexTrace.cpp
+++ b/plugins/GSdx/GSVertexTrace.cpp
@@ -120,8 +120,8 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
 
 using namespace Xbyak;
 
-GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize)
-	: CodeGenerator(maxsize, ptr)
+GSVertexTrace::CGSW::CGSW(uint32 key, void* code, size_t maxsize)
+	: CodeGenerator(maxsize, code)
 {
 	#if _M_AMD64
 	#error TODO
@@ -161,10 +161,10 @@ GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize)
 	static const float fmin = -FLT_MAX;
 	static const float fmax = FLT_MAX;
 
-	movss(xmm0, xmmword[&fmax]);
+	movss(xmm0, ptr[&fmax]);
 	shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
 
-	movss(xmm1, xmmword[&fmin]);
+	movss(xmm1, ptr[&fmin]);
 	shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
 
 	if(color)
@@ -202,7 +202,7 @@ L("loop");
 
 	if(tme && !fst && primclass == GS_SPRITE_CLASS)
 	{
-		movaps(xmm1, xmmword[edx + 1 * sizeof(GSVertexSW) + 32]);
+		movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
 		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
 	}
 
@@ -213,7 +213,7 @@ L("loop");
 			// min.c = min.c.minv(v[i + j].c);
 			// max.c = max.c.maxv(v[i + j].c);
 
-			movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW)]);
+			movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
 
 			minps(xmm2, xmm0);
 			maxps(xmm3, xmm0);
@@ -222,7 +222,7 @@ L("loop");
 		// min.p = min.p.minv(v[i + j].p);
 		// max.p = max.p.maxv(v[i + j].p);
 
-		movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 16]);
+		movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
 
 		minps(xmm4, xmm0);
 		maxps(xmm5, xmm0);
@@ -232,7 +232,7 @@ L("loop");
 			// min.t = min.t.minv(v[i + j].t);
 			// max.t = max.t.maxv(v[i + j].t);
 
-			movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 32]);
+			movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
 
 			if(!fst)
 			{
@@ -265,27 +265,27 @@ L("loop");
 	{
 		cvttps2dq(xmm2, xmm2);
 		psrld(xmm2, 7);
-		movaps(xmmword[eax], xmm2);
+		movaps(ptr[eax], xmm2);
 
 		cvttps2dq(xmm3, xmm3);
 		psrld(xmm3, 7);
-		movaps(xmmword[edx], xmm3);
+		movaps(ptr[edx], xmm3);
 	}
 
-	movaps(xmmword[eax + 16], xmm4);
-	movaps(xmmword[edx + 16], xmm5);
+	movaps(ptr[eax + 16], xmm4);
+	movaps(ptr[edx + 16], xmm5);
 
 	if(tme)
 	{
-		movaps(xmmword[eax + 32], xmm6);
-		movaps(xmmword[edx + 32], xmm7);
+		movaps(ptr[eax + 32], xmm6);
+		movaps(ptr[edx + 32], xmm7);
 	}
 
 	ret();
 }
 
-GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize)
-	: CodeGenerator(maxsize, ptr)
+GSVertexTrace::CGHW9::CGHW9(uint32 key, void* code, size_t maxsize)
+	: CodeGenerator(maxsize, code)
 {
 	#if _M_AMD64
 	#error TODO
@@ -327,10 +327,10 @@ GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize)
 	static const float fmin = -FLT_MAX;
 	static const float fmax = FLT_MAX;
 
-	movss(xmm0, xmmword[&fmax]);
+	movss(xmm0, ptr[&fmax]);
 	shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
 
-	movss(xmm1, xmmword[&fmin]);
+	movss(xmm1, ptr[&fmin]);
 	shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
 
 	if(color)
@@ -368,7 +368,7 @@ L("loop");
 
 	if(tme && !fst && primclass == GS_SPRITE_CLASS)
 	{
-		movaps(xmm1, xmmword[edx + 5 * sizeof(GSVertexHW9) + 16]);
+		movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
 		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
 	}
 
@@ -377,7 +377,7 @@ L("loop");
 		// min.p = min.p.minv(v[i + j].p);
 		// max.p = max.p.maxv(v[i + j].p);
 
-		movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9) + 16]);
+		movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
 
 		minps(xmm4, xmm0);
 		maxps(xmm5, xmm0);
@@ -390,7 +390,7 @@ L("loop");
 
 		if(color && (iip || j == n - 1) || tme)
 		{
-			movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9)]);
+			movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
 		}
 
 		if(color && (iip || j == n - 1))
@@ -455,15 +455,15 @@ L("loop");
 			punpcklwd(xmm3, xmm0);
 		}
 
-		movaps(xmmword[eax], xmm2);
-		movaps(xmmword[edx], xmm3);
+		movaps(ptr[eax], xmm2);
+		movaps(ptr[edx], xmm3);
 	}
 
 	// m_min.p = pmin;
 	// m_max.p = pmax;
 
-	movaps(xmmword[eax + 16], xmm4);
-	movaps(xmmword[edx + 16], xmm5);
+	movaps(ptr[eax + 16], xmm4);
+	movaps(ptr[edx + 16], xmm5);
 
 	if(tme)
 	{
@@ -473,15 +473,15 @@ L("loop");
 		shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
 		shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
 
-		movaps(xmmword[eax + 32], xmm6);
-		movaps(xmmword[edx + 32], xmm7);
+		movaps(ptr[eax + 32], xmm6);
+		movaps(ptr[edx + 32], xmm7);
 	}
 
 	ret();
 }
 
-GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize)
-	: CodeGenerator(maxsize, ptr)
+GSVertexTrace::CGHW11::CGHW11(uint32 key, void* code, size_t maxsize)
+	: CodeGenerator(maxsize, code)
 {
 	#if _M_AMD64
 	#error TODO
@@ -521,10 +521,10 @@ GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize)
 	static const float fmin = -FLT_MAX;
 	static const float fmax = FLT_MAX;
 
-	movss(xmm0, xmmword[&fmax]);
+	movss(xmm0, ptr[&fmax]);
 	shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
 
-	movss(xmm1, xmmword[&fmin]);
+	movss(xmm1, ptr[&fmin]);
 	shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
 
 	if(color)
@@ -564,7 +564,7 @@ L("loop");
 	{
 		if(color && (iip || j == n - 1) || tme)
 		{
-			movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW11)]);
+			movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
 		}
 
 		if(color && (iip || j == n - 1))
@@ -593,7 +593,7 @@ L("loop");
 			maxps(xmm7, xmm0);
 		}
 
-		movdqa(xmm0, xmmword[edx + j * sizeof(GSVertexHW11) + 16]);
+		movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
 
 		if(m_cpu.has(util::Cpu::tSSE41))
 		{
@@ -648,8 +648,8 @@ L("loop");
 			punpcklwd(xmm3, xmm0);
 		}
 
-		movaps(xmmword[eax], xmm2);
-		movaps(xmmword[edx], xmm3);
+		movaps(ptr[eax], xmm2);
+		movaps(ptr[edx], xmm3);
 	}
 
 	// m_min.p = pmin.xyww();
@@ -658,16 +658,16 @@ L("loop");
 	shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
 	shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
 
-	movaps(xmmword[eax + 16], xmm4);
-	movaps(xmmword[edx + 16], xmm5);
+	movaps(ptr[eax + 16], xmm4);
+	movaps(ptr[edx + 16], xmm5);
 
 	if(tme)
 	{
 		// m_min.t = tmin;
 		// m_max.t = tmax;
 
-		movaps(xmmword[eax + 32], xmm6);
-		movaps(xmmword[edx + 32], xmm7);
+		movaps(ptr[eax + 32], xmm6);
+		movaps(ptr[edx + 32], xmm7);
 	}
 
 	ret();
diff --git a/plugins/GSdx/GSVertexTrace.h b/plugins/GSdx/GSVertexTrace.h
index cbc199d528..bb34894d25 100644
--- a/plugins/GSdx/GSVertexTrace.h
+++ b/plugins/GSdx/GSVertexTrace.h
@@ -31,7 +31,7 @@
 
 class GSState;
 
-__aligned16 class GSVertexTrace
+__aligned32 class GSVertexTrace
 {
 	struct Vertex {GSVector4i c; GSVector4 p, t;};
 	struct VertexAlpha {int min, max; bool valid;};
@@ -41,14 +41,14 @@ __aligned16 class GSVertexTrace
 	class CGSW : public Xbyak::CodeGenerator
 	{
 	public:
-		CGSW(uint32 key, void* ptr, size_t maxsize);
+		CGSW(uint32 key, void* code, size_t maxsize);
 	};
 
 	class GSVertexTraceMapSW : public GSCodeGeneratorFunctionMap<CGSW, uint32, VertexTracePtr>
 	{
 	public:
 		GSVertexTraceMapSW() : GSCodeGeneratorFunctionMap("VertexTraceSW") {}
-		CGSW* Create(uint32 key, void* ptr, size_t maxsize) {return new CGSW(key, ptr, maxsize);}
+		CGSW* Create(uint32 key, void* code, size_t maxsize) {return new CGSW(key, code, maxsize);}
 	};
 
 	class CGHW9 : public Xbyak::CodeGenerator
@@ -63,7 +63,7 @@ __aligned16 class GSVertexTrace
 	{
 	public:
 		GSVertexTraceMapHW9() : GSCodeGeneratorFunctionMap("VertexTraceHW9") {}
-		CGHW9* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW9(key, ptr, maxsize);}
+		CGHW9* Create(uint32 key, void* code, size_t maxsize) {return new CGHW9(key, code, maxsize);}
 	};
 
 	class CGHW11 : public Xbyak::CodeGenerator
@@ -78,7 +78,7 @@ __aligned16 class GSVertexTrace
 	{
 	public:
 		GSVertexTraceMapHW11() : GSCodeGeneratorFunctionMap("VertexTraceHW11") {}
-		CGHW11* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW11(key, ptr, maxsize);}
+		CGHW11* Create(uint32 key, void* code, size_t maxsize) {return new CGHW11(key, code, maxsize);}
 	};
 
 	GSVertexTraceMapSW m_map_sw;
diff --git a/plugins/GSdx/GSWnd.cpp b/plugins/GSdx/GSWnd.cpp
index 07da8a22d7..dbb15924d2 100644
--- a/plugins/GSdx/GSWnd.cpp
+++ b/plugins/GSdx/GSWnd.cpp
@@ -174,6 +174,7 @@ GSVector4i GSWnd::GetClientRect()
 
 // Returns FALSE if the window has no title, or if th window title is under the strict
 // management of the emulator.
+
 bool GSWnd::SetWindowText(const char* title)
 {
 	if( !m_IsManaged ) return false;
diff --git a/plugins/GSdx/GSdx.def b/plugins/GSdx/GSdx.def
index eaa5777897..265ee44ff9 100644
--- a/plugins/GSdx/GSdx.def
+++ b/plugins/GSdx/GSdx.def
@@ -40,4 +40,4 @@ EXPORTS
 	GSgetLastTag
 	GSReplay
 	GSBenchmark
-	GSgetTitleInfo2
\ No newline at end of file
+	GSgetTitleInfo2
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index 84ca84913a..85eb2dca3d 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -57,6 +57,7 @@
 #include <algorithm>
 
 // Let's take advantage of the work that's already been done on making things cross-platform by bringing this in.
+
 #include "Pcsx2Defs.h"
 
 using namespace std;
@@ -126,7 +127,7 @@ typedef signed long long int64;
 
 #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
 
-#define USE_UPSCALE_HACKS //Hacks intended to fix upscaling / rendering glitches in HW renderers
+#define USE_UPSCALE_HACKS // Hacks intended to fix upscaling / rendering glitches in HW renderers
 
 // dxsdk beta missing these:
 #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO
diff --git a/plugins/GSdx/xbyak/xbyak.h b/plugins/GSdx/xbyak/xbyak.h
index 8ca512b078..58a5eda245 100644
--- a/plugins/GSdx/xbyak/xbyak.h
+++ b/plugins/GSdx/xbyak/xbyak.h
@@ -1,12 +1,12 @@
-#ifndef XBYAK_H_
-#define XBYAK_H_
+#ifndef XBYAK_XBYAK_H_
+#define XBYAK_XBYAK_H_
 /*!
 	@file xbyak.h
 	@brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++
 	@author herumi
-	@version $Revision: 1.157 $
+	@version $Revision: 1.238 $
 	@url http://homepage1.nifty.com/herumi/soft/xbyak.html
-	@date $Date: 2008/12/30 04:53:11 $
+	@date $Date: 2011/02/04 03:46:09 $
 	@note modified new BSD license
 	http://www.opensource.org/licenses/bsd-license.php
 */
@@ -15,9 +15,12 @@
 #include <assert.h>
 #include <map>
 #include <string>
-#ifdef __GNUC__
-#include <unistd.h>
-#include <sys/mman.h>
+#include <algorithm>
+#ifdef _WIN32
+	#include <windows.h>
+#elif defined(__GNUC__)
+	#include <unistd.h>
+	#include <sys/mman.h>
 #endif
 
 #ifdef __x86_64__
@@ -45,13 +48,6 @@
 			#pragma warning(disable : 4127) /* condition is constant(for "if" trick) */
 		#endif
 	#endif
-	#include <windows.h>
-#endif
-
-#ifndef NUM_OF_ARRAY
-//	template<class T, int N>
-//	size_t num_of_array(const T (&)[N]) { return N; }
-	#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(*x))
 #endif
 
 namespace Xbyak {
@@ -59,29 +55,35 @@ namespace Xbyak {
 #include "xbyak_bin2hex.h"
 
 enum {
-	DEFAULT_MAX_CODE_SIZE = 2048,
-	VERSION = 0x2070, /* 0xABCD = A.BC(D) */
+	DEFAULT_MAX_CODE_SIZE = 4096,
+	VERSION = 0x2990, /* 0xABCD = A.BC(D) */
 };
 /*
-#ifndef MIE_DEFINED_UINT32
-	#define MIE_DEFINED_UINT32
-	#ifdef _MSC_VER
-		typedef unsigned __int64 uint64;
-	#else
-		typedef unsigned long long uint64;
-	#endif
-	typedef unsigned int uint32;
-	typedef unsigned short uint16;
-	typedef unsigned char uint8;
-	#ifndef MIE_ALIGN
-		#ifdef _MSC_VER
-			#define MIE_ALIGN(x) __declspec(align(x))
-		#else
-			#define MIE_ALIGN(x) __attribute__((aligned(x)))
-		#endif
-	#endif
+#ifndef MIE_INTEGER_TYPE_DEFINED
+#define MIE_INTEGER_TYPE_DEFINED
+#ifdef _MSC_VER
+	typedef unsigned __int64 uint64;
+	typedef __int64 sint64;
+#else
+	typedef unsigned long long uint64;
+	typedef long long sint64;
+#endif
+typedef unsigned int uint32;
+typedef unsigned short uint16;
+typedef unsigned char uint8;
 #endif
 */
+#ifndef MIE_ALIGN
+	#ifdef _MSC_VER
+		#define MIE_ALIGN(x) __declspec(align(x))
+	#else
+		#define MIE_ALIGN(x) __attribute__((aligned(x)))
+	#endif
+#endif
+#ifndef MIE_PACK // for shufps
+	#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
+#endif
+
 enum Error {
 	ERR_NONE = 0,
 	ERR_BAD_ADDRESSING,
@@ -101,6 +103,10 @@ enum Error {
 	ERR_CANT_USE_64BIT_DISP,
 	ERR_OFFSET_IS_TOO_BIG,
 	ERR_MEM_SIZE_IS_NOT_SPECIFIED,
+	ERR_BAD_MEM_SIZE,
+	ERR_BAD_ST_COMBINATION,
+	ERR_OVER_LOCAL_LABEL,
+	ERR_UNDER_LOCAL_LABEL,
 	ERR_INTERNAL
 };
 
@@ -125,6 +131,10 @@ static inline const char *ConvertErrorToString(Error err)
 		"can't use 64bit disp(use (void*))",
 		"offset is too big",
 		"MEM size is not specified",
+		"bad mem size",
+		"bad st combination",
+		"over local label",
+		"under local label",
 		"internal error",
 	};
 	if (err < 0 || err > ERR_INTERNAL) return 0;
@@ -135,7 +145,7 @@ namespace inner {
 
 enum { debug = 1 };
 
-static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0)
+static inline uint32 GetPtrDist(const void *p1, const void *p2)
 {
 	uint64 diff = static_cast<const char *>(p1) - static_cast<const char *>(p2);
 #ifdef XBYAK64
@@ -145,6 +155,7 @@ static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0)
 }
 
 static inline bool IsInDisp8(uint32 x) { return 0xFFFFFF80 <= x || x <= 0x7F; }
+static inline bool IsInInt32(uint64 x) { return 0xFFFFFFFF80000000ULL <= x || x <= 0x7FFFFFFFU; }
 
 }
 
@@ -163,7 +174,8 @@ public:
 		REG = 1 << 3,
 		MMX = 1 << 4,
 		XMM = 1 << 5,
-		FPU = 1 << 6
+		FPU = 1 << 6,
+		YMM = 1 << 7
 	};
 	enum Code {
 #ifdef XBYAK64
@@ -191,10 +203,11 @@ public:
 	bool isNone() const { return kind_ == 0; }
 	bool isMMX() const { return is(MMX); }
 	bool isXMM() const { return is(XMM); }
+	bool isYMM() const { return is(YMM); }
 	bool isREG(int bit = 0) const { return is(REG, bit); }
 	bool isMEM(int bit = 0) const { return is(MEM, bit); }
+	bool isFPU() const { return is(FPU); }
 	bool isExt8bit() const { return ext8bit_ != 0; }
-	Operand changeBit(int bit) const { return Operand(idx_, static_cast<Kind>(kind_), bit, ext8bit_); }
 	// any bit is accetable if bit == 0
 	bool is(int kind, uint32 bit = 0) const
 	{
@@ -216,12 +229,18 @@ public:
 				{ "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",  "r11", "r12", "r13", "r14", "r15" },
 			};
 			return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx_];
-		} else if (isMMX()) {
-			static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
+		} else if (isYMM()) {
+			static const char tbl[16][5] = { "ym0", "ym1", "ym2", "ym3", "ym4", "ym5", "ym6", "ym7", "ym8", "ym9", "ym10", "ym11", "ym12", "ym13", "ym14", "ym15" };
 			return tbl[idx_];
 		} else if (isXMM()) {
 			static const char tbl[16][5] = { "xm0", "xm1", "xm2", "xm3", "xm4", "xm5", "xm6", "xm7", "xm8", "xm9", "xm10", "xm11", "xm12", "xm13", "xm14", "xm15" };
 			return tbl[idx_];
+		} else if (isMMX()) {
+			static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
+			return tbl[idx_];
+		} else if (isFPU()) {
+			static const char tbl[8][4] = { "st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7" };
+			return tbl[idx_];
 		}
 		throw ERR_INTERNAL;
 	}
@@ -229,14 +248,15 @@ public:
 
 class Reg : public Operand {
 	void operator=(const Reg&);
+	bool hasRex() const { return isExt8bit() | isREG(64) | isExtIdx(); }
 public:
 	Reg() { }
 	Reg(int idx, Kind kind, int bit = 0, int ext8bit = 0) : Operand(idx, kind, bit, ext8bit) { }
-	// reg = this
-	uint8 getRex(const Reg& index = Reg(), const Reg& base = Reg()) const
+	Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); }
+	bool isExtIdx() const { return getIdx() > 7; }
+	uint8 getRex(const Reg& base = Reg()) const
 	{
-		if ((!isExt8bit() && !index.isExt8bit() && !base.isExt8bit()) && (getIdx() | index.getIdx() | base.getIdx()) < 8) return 0;
-		return uint8(0x40 | ((getIdx() >> 3) << 2)| ((index.getIdx() >> 3) << 1) | (base.getIdx() >> 3));
+		return (hasRex() || base.hasRex()) ? uint8(0x40 | ((isREG(64) | base.isREG(64)) ? 8 : 0) | (isExtIdx() ? 4 : 0)| (base.isExtIdx() ? 1 : 0)) : 0;
 	}
 };
 
@@ -261,7 +281,19 @@ public:
 class Xmm : public Mmx {
 	void operator=(const Xmm&);
 public:
-	explicit Xmm(int idx) : Mmx(idx, Operand::XMM, 128) { }
+	explicit Xmm(int idx, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { }
+};
+
+class Ymm : public Xmm {
+	void operator=(const Ymm&);
+public:
+	explicit Ymm(int idx) : Xmm(idx, Operand::YMM, 256) { }
+};
+
+class Fpu : public Reg {
+	void operator=(const Fpu&);
+public:
+	explicit Fpu(int idx) : Reg(idx, Operand::FPU, 32) { }
 };
 
 // register for addressing(32bit or 64bit)
@@ -307,7 +339,7 @@ private:
 	{
 		return operator+(r, -static_cast<int>(disp));
 	}
-	void operator=(const Reg32e&); // don't call
+	void operator=(const Reg32e&);
 public:
 	explicit Reg32e(int idx, int bit)
 		: Reg(idx, REG, bit)
@@ -362,7 +394,7 @@ struct RegRip {
 
 class CodeArray {
 	enum {
-		ALIGN_SIZE = 16,
+		ALIGN_PAGE_SIZE = 4096,
 		MAX_FIXED_BUF_SIZE = 8
 	};
 	enum Type {
@@ -381,13 +413,12 @@ protected:
 public:
 	CodeArray(size_t maxSize = MAX_FIXED_BUF_SIZE, void *userPtr = 0)
 		: type_(userPtr ? USER_BUF : maxSize <= MAX_FIXED_BUF_SIZE ? FIXED_BUF : ALLOC_BUF)
-		, allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_SIZE] : 0)
+		, allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_PAGE_SIZE] : 0)
 		, maxSize_(maxSize)
-		, top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_)
+		, top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_, ALIGN_PAGE_SIZE) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_)
 		, size_(0)
 	{
 		if (type_ == ALLOC_BUF && !protect(top_, maxSize, true)) {
-//			fprintf(stderr, "can't protect (addr=%p, size=%u, canExec=%d)\n", addr, size, canExec);
 			throw ERR_CANT_PROTECT;
 		}
 	}
@@ -452,19 +483,19 @@ public:
 	/*
 		@param data [in] address of jmp data
 		@param disp [in] offset from the next of jmp
-		@param isShort [in] true if short jmp
+		@param size [in] write size(1, 2, 4, 8)
 	*/
-	void rewrite(uint8 *data, uint32 disp, bool isShort)
+	void rewrite(uint8 *data, uint64 disp, size_t size)
 	{
-		if (isShort) {
-			data[0] = static_cast<uint8>(disp);
-		} else {
-			data[0] = static_cast<uint8>(disp);
-			data[1] = static_cast<uint8>(disp >> 8);
-			data[2] = static_cast<uint8>(disp >> 16);
-			data[3] = static_cast<uint8>(disp >> 24);
+		if (size != 1 && size != 2 && size != 4 && size != 8) throw ERR_BAD_PARAMETER;
+		for (size_t i = 0; i < size; i++) {
+			data[i] = static_cast<uint8>(disp >> (i * 8));
 		}
 	}
+	void updateRegField(uint8 regIdx) const
+	{
+		*top_ = (*top_ & B11000111) | ((regIdx << 3) & B00111000);
+	}
 	/**
 		change exec permission of memory
 		@param addr [in] buffer address
@@ -474,15 +505,15 @@ public:
 	*/
 	static inline bool protect(const void *addr, size_t size, bool canExec)
 	{
-#ifdef __GNUC__
+#if defined(_WIN32)
+		DWORD oldProtect;
+		return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
+#elif defined(__GNUC__)
 		size_t pageSize = sysconf(_SC_PAGESIZE);
 		size_t iaddr = reinterpret_cast<size_t>(addr);
 		size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
 		int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0);
 		return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
-#elif defined(_WIN32)
-		DWORD oldProtect;
-		return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
 #else
 		return true;
 #endif
@@ -493,7 +524,7 @@ public:
 		@param alingedSize [in] power of two
 		@return aligned addr by alingedSize
 	*/
-	static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE)
+	static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = 16)
 	{
 		return reinterpret_cast<uint8*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1)));
 	}
@@ -521,11 +552,7 @@ public:
 	uint64 getDisp() const { return disp_; }
 	uint8 getRex() const { return rex_; }
 	bool is64bitDisp() const { return is64bitDisp_; } // for moffset
-#ifdef XBYAK64
 	void setRex(uint8 rex) { rex_ = rex; }
-#else
-	void setRex(uint8) { }
-#endif
 };
 
 class AddressFrame {
@@ -536,7 +563,11 @@ public:
 	explicit AddressFrame(uint32 bit) : bit_(bit) { }
 	Address operator[](const void *disp) const
 	{
-		Reg32e r(Reg(), Reg(), 0, inner::GetPtrDist(disp));
+		size_t adr = reinterpret_cast<size_t>(disp);
+#ifdef XBYAK64
+		if (adr > 0xFFFFFFFFU) throw ERR_OFFSET_IS_TOO_BIG;
+#endif
+		Reg32e r(Reg(), Reg(), 0, static_cast<uint32>(adr));
 		return operator[](r);
 	}
 #ifdef XBYAK64
@@ -587,7 +618,8 @@ public:
 		} else if (mod == mod10 || (mod == mod00 && r.isNone())) {
 			frame.dd(r.disp_);
 		}
-		frame.setRex(Reg().getRex(r.index_, r));
+		uint8 rex = ((r.getIdx() | r.index_.getIdx()) < 8) ? 0 : uint8(0x40 | ((r.index_.getIdx() >> 3) << 1) | (r.getIdx() >> 3));
+		frame.setRex(rex);
 		return frame;
 	}
 };
@@ -600,6 +632,12 @@ struct JmpLabel {
 class Label {
 	CodeArray *base_;
 	int anonymousCount_; // for @@, @f, @b
+	enum {
+		maxStack = 10
+	};
+	int stack_[maxStack];
+	int stackPos_;
+	int usedCount_;
 	int localCount_; // for .***
 	typedef std::map<const std::string, const uint8*> DefinedList;
 	typedef std::multimap<const std::string, const JmpLabel> UndefinedList;
@@ -628,15 +666,22 @@ public:
 	Label()
 		: base_(0)
 		, anonymousCount_(0)
+		, stackPos_(1)
+		, usedCount_(0)
 		, localCount_(0)
 	{
 	}
-	void incLocalCount() { localCount_++; }
-	void decLocalCount() { localCount_--; }
-	void set(CodeArray *base)
+	void enterLocal()
 	{
-		base_ = base;
+		if (stackPos_ == maxStack) throw ERR_OVER_LOCAL_LABEL;
+		localCount_ = stack_[stackPos_++] = ++usedCount_;
 	}
+	void leaveLocal()
+	{
+		if (stackPos_ == 1) throw ERR_UNDER_LOCAL_LABEL;
+		localCount_ = stack_[--stackPos_ - 1];
+	}
+	void set(CodeArray *base) { base_ = base; }
 	void define(const char *label, const uint8 *address)
 	{
 		std::string newLabel(label);
@@ -657,8 +702,9 @@ public:
 			const JmpLabel *jmp = &itr->second;
 			uint32 disp = inner::GetPtrDist(address, jmp->endOfJmp);
 			if (jmp->isShort && !inner::IsInDisp8(disp)) throw ERR_LABEL_IS_TOO_FAR;
-			uint8 *data = jmp->endOfJmp - (jmp->isShort ? 1 : 4);
-			base_->rewrite(data, disp, jmp->isShort);
+			size_t jmpSize = jmp->isShort ? 1 : 4;
+			uint8 *data = jmp->endOfJmp - jmpSize;
+			base_->rewrite(data, disp, jmpSize);
 			undefinedList_.erase(itr);
 		}
 	}
@@ -689,22 +735,22 @@ public:
 	static inline std::string toStr(int num)
 	{
 		char buf[16];
-		static const char fmt[] = ".%08x";
 #ifdef _WIN32
 		#if _MSC_VER < 1400
-			_snprintf(buf, sizeof(buf), fmt, num);
+			_snprintf
 		#else
-			_snprintf_s(buf, sizeof(buf), fmt, num);
+			_snprintf_s
 		#endif
 #else
-		snprintf(buf, sizeof(buf), fmt, num);
+		snprintf
 #endif
+		(buf, sizeof(buf), ".%08x", num);
 		return buf;
 	}
 };
 
 class CodeGenerator : public CodeArray {
-protected:
+public:
 	enum LabelType {
 		T_SHORT,
 		T_NEAR,
@@ -747,35 +793,43 @@ private:
 	{
 		return op1.isREG(i32e) && (op2.isXMM() || op2.isMEM());
 	}
-	void if16bit(const Operand& reg1, const Operand& reg2)
-	{
-		// except movsx(16bit, 32/64bit)
-		if ((reg1.isBit(16) && !reg2.isBit(i32e)) || (reg2.isBit(16) && !reg1.isBit(i32e))) db(0x66);
-	}
-	void rexAddr(const Address& addr, const Reg& reg = Reg())
-	{
-#ifdef XBYAK64
-		if (addr.is32bit_) db(0x67);
-#endif
-		if16bit(reg, addr);
-		uint32 rex = addr.getRex() | reg.getRex();
-		if (reg.isREG(64)) rex |= 0x48;
-		if (rex) db(rex);
-	}
 	void rex(const Operand& op1, const Operand& op2 = Operand())
 	{
-		if (op1.isMEM()) {
-			rexAddr(static_cast<const Address&>(op1), static_cast<const Reg&>(op2));
-		} else if (op2.isMEM()) {
-			rexAddr(static_cast<const Address&>(op2), static_cast<const Reg&>(op1));
+		uint8 rex = 0;
+		const Operand *p1 = &op1, *p2 = &op2;
+		if (p1->isMEM()) std::swap(p1, p2);
+		if (p1->isMEM()) throw ERR_BAD_COMBINATION;
+		if (p2->isMEM()) {
+			const Address& addr = static_cast<const Address&>(*p2);
+			if (BIT == 64 && addr.is32bit_) db(0x67);
+			rex = addr.getRex() | static_cast<const Reg&>(*p1).getRex();
 		} else {
-			const Reg& reg1 = static_cast<const Reg&>(op1);
-			const Reg& reg2 = static_cast<const Reg&>(op2);
 			// ModRM(reg, base);
-			if16bit(reg1, reg2);
-			uint8 rex = reg2.getRex(Reg(), reg1);
-			if (reg1.isREG(64) || reg2.isREG(64)) rex |= 0x48;
-			if (rex) db(rex);
+			rex = static_cast<const Reg&>(op2).getRex(static_cast<const Reg&>(op1));
+		}
+		// except movsx(16bit, 32/64bit)
+		if ((op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e))) db(0x66);
+		if (rex) db(rex);
+	}
+	enum AVXtype {
+		PP_NONE = 1 << 0,
+		PP_66 = 1 << 1,
+		PP_F3 = 1 << 2,
+		PP_F2 = 1 << 3,
+		MM_RESERVED = 1 << 4,
+		MM_0F = 1 << 5,
+		MM_0F38 = 1 << 6,
+		MM_0F3A = 1 << 7
+	};
+	void vex(bool r, int idx, bool is256, int type, bool x = false, bool b = false, int w = 1)
+	{
+		uint32 pp = (type & PP_66) ? 1 : (type & PP_F3) ? 2 : (type & PP_F2) ? 3 : 0;
+		uint32 vvvv = (((~idx) & 15) << 3) | (is256 ? 4 : 0) | pp;
+		if (!b && !x && !w && (type & MM_0F)) {
+			db(0xC5); db((r ? 0 : 0x80) | vvvv);
+		} else {
+			uint32 mmmm = (type & MM_0F) ? 1 : (type & MM_0F38) ? 2 : (type & MM_0F3A) ? 3 : 0;
+			db(0xC4); db((r ? 0 : 0x80) | (x ? 0 : 0x40) | (b ? 0 : 0x20) | mmmm); db((w << 7) | vvvv);
 		}
 	}
 	Label label_;
@@ -792,10 +846,8 @@ private:
 		if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
 		rex(addr, reg);
 		db(code0 | (reg.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
-		uint8 t = *addr.getCode();
-		assert((t & ~0xC7) == 0); /* 0b11000111 */
-		db(t | ((reg.getIdx() & 7) << 3)); // update reg field
-		db(addr.getCode() + 1, static_cast<int>(addr.getSize()) - 1);
+		addr.updateRegField(static_cast<uint8>(reg.getIdx()));
+		db(addr.getCode(), static_cast<int>(addr.getSize()));
 	}
 	void opJmp(const char *label, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref)
 	{
@@ -835,13 +887,13 @@ private:
 		if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) {
 			db(shortCode);
 			db(0);
-			rewrite(top + shortHeaderSize, disp - shortJmpSize, true);
+			rewrite(top + shortHeaderSize, disp - shortJmpSize, 1);
 		} else {
 			if (type == T_SHORT) throw ERR_LABEL_IS_TOO_FAR;
 			if (longPref) db(longPref);
 			db(longCode);
 			dd(0);
-			rewrite(top + longHeaderSize, disp - longJmpSize, false);
+			rewrite(top + longHeaderSize, disp - longJmpSize, 4);
 		}
 	}
 	/* preCode is for SSSE3/SSE4 */
@@ -864,8 +916,7 @@ private:
 	}
 	void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE)
 	{
-		pref = mmx.isXMM() ? pref : NONE;
-		opGen(mmx, op, code, pref, isXMMorMMX_MEM, imm8, preCode);
+		opGen(mmx, op, code, mmx.isXMM() ? pref : NONE, isXMMorMMX_MEM, imm8, preCode);
 	}
 	void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
 	{
@@ -887,14 +938,14 @@ private:
 			opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, B00111010);
 		}
 	}
-	void opR_ModM(const Operand& op, int bit, uint8 mod, int ext, int code0, int code1 = NONE, int code2 = NONE)
+	void opR_ModM(const Operand& op, int bit, int ext, int code0, int code1 = NONE, int code2 = NONE, bool disableRex = false)
 	{
+		int opBit = op.getBit();
+		if (disableRex && opBit == 64) opBit = 32;
 		if (op.isREG(bit)) {
-			rex(op);
-			db(code0 | (op.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
-			db(getModRM(mod, ext, op.getIdx()));
+			opModR(Reg(ext, Operand::REG, opBit), static_cast<const Reg&>(op).changeBit(opBit), code0, code1, code2);
 		} else if (op.isMEM()) {
-			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code0, code1, code2);
+			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, opBit), code0, code1, code2);
 		} else {
 			throw ERR_BAD_COMBINATION;
 		}
@@ -902,13 +953,13 @@ private:
 	void opShift(const Operand& op, int imm, int ext)
 	{
 		verifyMemHasSize(op);
-		opR_ModM(op, 0, 3, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4)));
+		opR_ModM(op, 0, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4)));
 		if (imm != 1) db(imm);
 	}
 	void opShift(const Operand& op, const Reg8& cl, int ext)
 	{
 		if (cl.getIdx() != Operand::CL) throw ERR_BAD_COMBINATION;
-		opR_ModM(op, 0, 3, ext, B11010010);
+		opR_ModM(op, 0, ext, B11010010);
 	}
 	void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE)
 	{
@@ -941,20 +992,19 @@ private:
 		verifyMemHasSize(op);
 		uint32 immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32;
 		if (op.getBit() < immBit) throw ERR_IMM_IS_TOO_BIG;
-		if (op.isREG()) {
-			if (immBit == 16 && op.isBit(32)) immBit = 32; /* don't use MEM16 if 32bit mode */
-		}
+		if (op.isREG(32|64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
 		if (op.isREG() && op.getIdx() == 0 && (op.getBit() == immBit || (op.isBit(64) && immBit == 32))) { // rax, eax, ax, al
 			rex(op);
 			db(code | 4 | (immBit == 8 ? 0 : 1));
 		} else {
-			int tmp = (op.getBit() > immBit && 32 > immBit) ? 2 : 0;
-			opR_ModM(op, 0, 3, ext, B10000000 | tmp);
+			int tmp = immBit < (std::min)(op.getBit(), 32U) ? 2 : 0;
+			opR_ModM(op, 0, ext, B10000000 | tmp);
 		}
 		db(imm, immBit / 8);
 	}
 	void opIncDec(const Operand& op, int code, int ext)
 	{
+		verifyMemHasSize(op);
 #ifndef XBYAK64
 		if (op.isREG() && !op.isBit(8)) {
 			rex(op); db(code | op.getIdx());
@@ -964,21 +1014,15 @@ private:
 		code = B11111110;
 		if (op.isREG()) {
 			opModR(Reg(ext, Operand::REG, op.getBit()), static_cast<const Reg&>(op), code);
-		} else if (op.isMEM() && op.getBit() > 0) {
-			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
 		} else {
-			throw ERR_BAD_COMBINATION;
+			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
 		}
 	}
 	void opPushPop(const Operand& op, int code, int ext, int alt)
 	{
 		if (op.isREG()) {
-#ifdef XBYAK64
 			if (op.isBit(16)) db(0x66);
 			if (static_cast<const Reg&>(op).getIdx() >= 8) db(0x41);
-#else
-			rex(op);
-#endif
 			db(alt | (op.getIdx() & 7));
 		} else if (op.isMEM()) {
 			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
@@ -990,16 +1034,51 @@ private:
 	{
 		if (op.isMEM() && op.getBit() == 0) throw ERR_MEM_SIZE_IS_NOT_SPECIFIED;
 	}
-protected:
+	void opMovxx(const Reg& reg, const Operand& op, uint8 code)
+	{
+		int w = op.isBit(16);
+		bool cond = reg.isREG() && (reg.getBit() > op.getBit());
+		opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
+	}
+	void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
+	{
+		if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
+		uint8 code = addr.isBit(16) ? m16 : addr.isBit(32) ? m32 : addr.isBit(64) ? m64 : 0;
+		if (!code) throw ERR_BAD_MEM_SIZE;
+		if (m64ext && addr.isBit(64)) ext = m64ext;
+
+		rex(addr, st0);
+		db(code);
+		addr.updateRegField(ext);
+		db(addr.getCode(), static_cast<int>(addr.getSize()));
+	}
+	// like yasm not nasm
+	// use code1 if reg1 == st0
+	// use code2 if reg1 != st0 && reg2 == st0
+	void opFpuFpu(const Fpu& reg1, const Fpu& reg2, uint32 code1, uint32 code2)
+	{
+		uint32 code = reg1.getIdx() == 0 ? code1 : reg2.getIdx() == 0 ? code2 : 0;
+		if (!code) throw ERR_BAD_ST_COMBINATION;
+		db(uint8(code >> 8));
+		db(uint8(code | (reg1.getIdx() | reg2.getIdx())));
+	}
+	void opFpu(const Fpu& reg, uint8 code1, uint8 code2)
+	{
+		db(code1); db(code2 | reg.getIdx());
+	}
+public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
 	const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
 	const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+	const Ymm ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
 	const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7;
+	const Ymm &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7;
 	const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi;
 	const Reg16 ax, cx, dx, bx, sp, bp, si, di;
 	const Reg8 al, cl, dl, bl, ah, ch, dh, bh;
-	const AddressFrame ptr, byte, word, dword, qword, xmmword;
+	const AddressFrame ptr, byte, word, dword, qword;
+	const Fpu st0, st1, st2, st3, st4, st5, st6, st7;
 #ifdef XBYAK64
 	const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15;
 	const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d;
@@ -1007,7 +1086,9 @@ protected:
 	const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b;
 	const Reg8 spl, bpl, sil, dil;
 	const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-	const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15;
+	const Ymm ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+	const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15; // for my convenience
+	const Ymm &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15;
 	const RegRip rip;
 #endif
 
@@ -1015,8 +1096,8 @@ protected:
 	{
 		label_.define(label, getCurr());
 	}
-	void inLocalLabel() { label_.incLocalCount(); }
-	void outLocalLabel() { label_.decLocalCount(); }
+	void inLocalLabel() { label_.enterLocal(); }
+	void outLocalLabel() { label_.leaveLocal(); }
 	void jmp(const char *label, LabelType type = T_AUTO)
 	{
 		opJmp(label, type, B11101011, B11101001, 0);
@@ -1027,7 +1108,11 @@ protected:
 	}
 	void jmp(const Operand& op)
 	{
-		opR_ModM(op, i32e, 3, 4, 0xFF);
+		opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true);
+	}
+	void call(const Operand& op)
+	{
+		opR_ModM(op, 16 | i32e, 2, 0xFF, NONE, NONE, true);
 	}
 	// (REG|MEM, REG)
 	void test(const Operand& op, const Reg& reg)
@@ -1042,10 +1127,9 @@ protected:
 			rex(op);
 			db(B10101000 | (op.isBit(8) ? 0 : 1));
 		} else {
-			opR_ModM(op, 0, 3, 0, B11110110);
+			opR_ModM(op, 0, 0, B11110110);
 		}
-		int size = op.getBit() / 8; if (size > 4) size = 4;
-		db(imm, size);
+		db(imm, (std::min)(op.getBit() / 8, 4U));
 	}
 	void ret(int imm = 0)
 	{
@@ -1134,24 +1218,39 @@ protected:
 			opRM_RM(reg1, reg2, B10001000);
 		}
 	}
-	void mov(const Operand& op, uint64 imm)
+	void mov(const Operand& op, // mov op, imm; the immediate's type is selected by the build just below
+#ifdef XBYAK64
+	uint64 // XBYAK64 build: 64-bit immediate parameter
+#else
+	uint32 // otherwise: 32-bit immediate parameter
+#endif
+	imm)
 	{
 		verifyMemHasSize(op);
 		if (op.isREG()) {
-			int w = op.isBit(8) ? 0 : 1;
-			rex(op); db(B10110000 | (w << 3) | (op.getIdx() & 7));
+			rex(op);
+			int code, size; // base byte to merge with the register index, and number of immediate bytes to emit
+#ifdef XBYAK64
+			if (op.isBit(64) && inner::IsInInt32(imm)) { // 64-bit register and imm passes IsInInt32: shorter form with a 4-byte immediate
+				db(B11000111);
+				code = B11000000;
+				size = 4;
+			} else
+#endif
+			{
+				code = B10110000 | ((op.isBit(8) ? 0 : 1) << 3); // bit 3 is set for every width except 8-bit
+				size = op.getBit() / 8; // immediate is as wide as the register
+			}
+
+			db(code | (op.getIdx() & 7)); // low three bits of the register index are merged into the byte
+			db(imm, size);
 		} else if (op.isMEM()) {
 			opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110);
+			int size = op.getBit() / 8; if (size > 4) size = 4; // memory form never emits more than 4 immediate bytes
+			db(static_cast<uint32>(imm), size); // NOTE(review): with a qword memory operand the upper 32 bits of imm are dropped here
 		} else {
 			throw ERR_BAD_COMBINATION;
 		}
-		db(imm, op.getBit() / 8);
-	}
-	void opMovxx(const Reg& reg, const Operand& op, uint8 code)
-	{
-		int w = op.isBit(16);
-		bool cond = reg.isREG() && (reg.getBit() > op.getBit());
-		opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
 	}
 	void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B11000111); }
 #ifdef XBYAK64
@@ -1180,20 +1279,17 @@ protected:
 	}
 	void call(const char *label)
 	{
-		opJmp(label, T_NEAR, 0, B10011010, 0);
+		opJmp(label, T_NEAR, 0, B11101000, 0);
 	}
 	void call(const void *addr)
 	{
 		opJmp(addr, T_NEAR, 0, B11101000, 0);
 	}
-	void call(const Operand& op)
-	{
-		opR_ModM(op, 16 | i32e, 3, 2, B11111111);
-	}
 	// special case
 	void movd(const Address& addr, const Mmx& mmx)
 	{
-		opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01111110);
+		if (mmx.isXMM()) db(0x66); // XMM register: emit the 0x66 prefix byte first (nothing extra for MMX)
+		opModM(addr, mmx, 0x0F, B01111110); // the register operand is now passed as-is instead of a rebuilt Reg
 	}
 	void movd(const Reg32& reg, const Mmx& mmx)
 	{
@@ -1202,8 +1298,8 @@ protected:
 	}
 	void movd(const Mmx& mmx, const Address& addr)
 	{
-		ASSERT(!addr.isBit(32)); // don't use dword ptr, bogus, won't output 0x66 for xmm dest op
-		opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01101110);
+		if (mmx.isXMM()) db(0x66); // emit the 0x66 prefix explicitly for XMM registers; the removed ASSERT noted it was not being output
+		opModM(addr, mmx, 0x0F, B01101110); // register operand passed directly, no size-derived Reg any more
 	}
 	void movd(const Mmx& mmx, const Reg32& reg)
 	{
@@ -1225,8 +1321,31 @@ protected:
 	}
 	void movq(const Address& addr, const Mmx& mmx)
 	{
-		opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, mmx.isXMM() ? B11010110 : B01111111);
+		if (mmx.isXMM()) db(0x66); // XMM register: 0x66 prefix byte precedes the opcode bytes
+		opModM(addr, mmx, 0x0F, mmx.isXMM() ? B11010110 : B01111111); // second opcode byte differs between XMM and MMX registers
 	}
+#ifdef XBYAK64
+	void movq(const Reg64& reg, const Mmx& mmx) // XBYAK64 only; uses the same second opcode byte (B01111110) as movd(const Address&, const Mmx&)
+	{
+		if (mmx.isXMM()) db(0x66); // 0x66 prefix byte only when the vector register is XMM
+		opModR(mmx, reg, 0x0F, B01111110); // vector register is passed first, general register second
+	}
+	void movq(const Mmx& mmx, const Reg64& reg) // XBYAK64 only; uses the same second opcode byte (B01101110) as movd(const Mmx&, const Address&)
+	{
+		if (mmx.isXMM()) db(0x66); // 0x66 prefix byte only when the vector register is XMM
+		opModR(mmx, reg, 0x0F, B01101110); // same operand order as the other Reg64 overload; only the opcode byte differs
+	}
+	void pextrq(const Operand& op, const Xmm& xmm, uint8 imm) // op must be a 64-bit register or a memory operand
+	{
+		if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION; // every other operand kind is rejected
+		opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, B00111010); // force to 64bit
+	}
+	void pinsrq(const Xmm& xmm, const Operand& op, uint8 imm) // op must be a 64-bit register or a memory operand
+	{
+		if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION; // every other operand kind is rejected
+		opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, B00111010); // force to 64bit
+	}
+#endif
 	// MMX2 : pextrw : reg, mmx/xmm, imm
 	// SSE4 : pextrw, pextrb, pextrd, extractps : reg/mem, mmx/xmm, imm
 	void pextrw(const Operand& op, const Mmx& xmm, uint8 imm) { opExt(op, xmm, 0x15, imm, true); }
@@ -1270,7 +1389,7 @@ protected:
 		bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
 		if (!is16bit && !(reg.isREG(i32e) && (op.isREG(i32e) || op.isMEM()))) throw ERR_BAD_COMBINATION;
 		if (is16bit) db(0x66);
-		db(0xF3); opModRM(Reg(reg.getIdx(), Operand::REG, i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8);
+		db(0xF3); opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8);
 	}
 	void crc32(const Reg32e& reg, const Operand& op)
 	{
@@ -1278,17 +1397,86 @@ protected:
 		db(0xF2);
 		opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1));
 	}
-public:
+	void vextractps(const Operand& op, const Xmm& xmm, uint8 imm) // op: 32-bit register or memory operand; xmm must not be a YMM register
+	{
+		if (!(op.isREG(32) || op.isMEM()) || xmm.isYMM()) throw ERR_BAD_COMBINATION;
+		opAVX_X_XM_IMM(xmm, cvtReg(op, op.isREG(), Operand::XMM), MM_0F3A | PP_66, 0x17, false, 0, imm); // a register op is swapped for the Xmm with the same index; supportYMM = false, w = 0
+	}
+	// support (x, x, x/m), (y, y, y/m); emits optional 0x67, the vex() prefix, opcode byte code0, then the address bytes or a ModRM byte
+	void opAVX_X_X_XM(const Xmm& xm1, const Operand& op1, const Operand& op2, int type, int code0, bool supportYMM, int w = -1)
+	{
+		const Xmm *xm2;
+		const Operand *op;
+		if (op2.isNone()) { // two-operand call: xm1 also serves as the second register, op1 is the reg/mem operand
+			xm2 = &xm1;
+			op = &op1;
+		} else {
+			if (!(op1.isXMM() || (supportYMM && op1.isYMM()))) throw ERR_BAD_COMBINATION; // op1 must be a vector register before the cast below
+			xm2 = static_cast<const Xmm*>(&op1);
+			op = &op2;
+		}
+		// (xm1, xm2, op)
+		if (!((xm1.isXMM() && xm2->isXMM()) || (supportYMM && xm1.isYMM() && xm2->isYMM()))) throw ERR_BAD_COMBINATION; // both registers XMM, or both YMM when supportYMM
+		bool x, b; // flags forwarded to vex() below
+		if (op->isMEM()) {
+			const Address& addr = *static_cast<const Address*>(op);
+			uint8 rex = addr.getRex();
+			x = (rex & 2) != 0;
+			b = (rex & 1) != 0;
+			if (BIT == 64 && addr.is32bit_) db(0x67); // 64-bit build with a 32-bit address: emit the 0x67 prefix byte
+			if (BIT == 64 && w == -1) w = (rex & 4) ? 1 : 0; // w left unspecified: take it from bit 2 of the address's rex byte
+		} else {
+			x = false;
+			b = static_cast<const Reg*>(op)->isExtIdx();
+		}
+		if (w == -1) w = 0; // still unspecified: default to 0
+		vex(xm1.isExtIdx(), xm2->getIdx(), xm1.isYMM(), type, x, b, w);
+		db(code0);
+		if (op->isMEM()) {
+			const Address& addr = *static_cast<const Address*>(op);
+			addr.updateRegField(static_cast<uint8>(xm1.getIdx())); // store xm1's index in the address's encoded bytes before emitting them
+			db(addr.getCode(), static_cast<int>(addr.getSize()));
+		} else {
+			db(getModRM(3, xm1.getIdx(), op->getIdx())); // register operand: single ModRM byte built with mod = 3
+		}
+	}
+	// if cvt then return a reference to this generator's Xmm(idx) (or Ymm(idx) when kind != Operand::XMM), otherwise return op unchanged
+	const Operand& cvtReg(const Operand& op, bool cvt, Operand::Kind kind) const
+	{
+		if (!cvt) return op;
+		const Xmm* xmTbl[] = { // built per call on purpose: a function-static table would keep pointers into the first CodeGenerator that called this, dangling once that object is destroyed
+			&xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7,
+#ifdef XBYAK64
+			&xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15
+#endif
+		};
+		const Ymm* ymTbl[] = { // same reasoning as xmTbl: entries must point at members of *this
+			&ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7,
+#ifdef XBYAK64
+			&ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15
+#endif
+		};
+		return (kind == Operand::XMM) ? *xmTbl[op.getIdx()] : *ymTbl[op.getIdx()]; // referenced register is a member of *this, so it stays valid after the local tables go away
+	}
+	// support (x, x/m, imm), (y, y/m, imm); forwards to opAVX_X_X_XM and then appends one immediate byte unless imm == NONE
+	void opAVX_X_XM_IMM(const Xmm& xmm, const Operand& op, int type, int code, bool supportYMM, int w = -1, int imm = NONE)
+	{
+		opAVX_X_X_XM(xmm, xmm.isXMM() ? xm0 : ym0, op, type, code, supportYMM, w); if (imm != NONE) db((uint8)imm); // register 0 of the matching class (xm0 or ym0) fills the middle operand slot
+	}
 	enum { NONE = 256 };
+public:
 	CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0)
 		: CodeArray(maxSize, userPtr)
 		, mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7)
 		, xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7)
+		, ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7)
 		, xm0(xmm0), xm1(xmm1), xm2(xmm2), xm3(xmm3), xm4(xmm4), xm5(xmm5), xm6(xmm6), xm7(xmm7) // for my convenience
+		, ym0(ymm0), ym1(ymm1), ym2(ymm2), ym3(ymm3), ym4(ymm4), ym5(ymm5), ym6(ymm6), ym7(ymm7) // for my convenience
 		, eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI)
 		, ax(Operand::EAX), cx(Operand::ECX), dx(Operand::EDX), bx(Operand::EBX), sp(Operand::ESP), bp(Operand::EBP), si(Operand::ESI), di(Operand::EDI)
 		, al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH)
-		, ptr(0), byte(8), word(16), dword(32), qword(64), xmmword(128)
+		, ptr(0), byte(8), word(16), dword(32), qword(64)
+		, st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7)
 #ifdef XBYAK64
 		, rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15)
 		, r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D)
@@ -1296,7 +1484,9 @@ public:
 		, r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B)
 		, spl(Operand::SPL, 1), bpl(Operand::BPL, 1), sil(Operand::SIL, 1), dil(Operand::DIL, 1)
 		, xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15)
+		, ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15)
 		, xm8(xmm8), xm9(xmm9), xm10(xmm10), xm11(xmm11), xm12(xmm12), xm13(xmm13), xm14(xmm14), xm15(xmm15) // for my convenience
+		, ym8(ymm8), ym9(ymm9), ym10(ymm10), ym11(ymm11), ym12(ymm12), ym13(ymm13), ym14(ymm14), ym15(ymm15) // for my convenience
 		, rip()
 #endif
 	{
@@ -1309,7 +1499,7 @@ public:
 //		if (hasUndefinedLabel()) throw ERR_LABEL_IS_NOT_FOUND;
 		return top_;
 	}
-#ifdef TEST_NM
+#ifdef XBYAK_TEST
 	void dump(bool doClear = true)
 	{
 		CodeArray::dump();
@@ -1322,7 +1512,7 @@ public:
 	void align(int x = 16)
 	{
 		if (x != 4 && x != 8 && x != 16 && x != 32) throw ERR_BAD_ALIGN;
-		while (inner::GetPtrDist(getCurr()) % x) {
+		while (size_t(getCurr()) % x) { // keep emitting nop() until the value of getCurr(), taken as an integer, is a multiple of x
 			nop();
 		}
 	}
@@ -1335,4 +1525,4 @@ public:
 
 } // end of namespace
 
-#endif // XBYAK_H_
+#endif // XBYAK_XBYAK_H_
diff --git a/plugins/GSdx/xbyak/xbyak_mnemonic.h b/plugins/GSdx/xbyak/xbyak_mnemonic.h
index e82d422b35..5a6e33484a 100644
--- a/plugins/GSdx/xbyak/xbyak_mnemonic.h
+++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h
@@ -1,4 +1,4 @@
-const char *getVersionString() const { return "2.07"; }
+const char *getVersionString() const { return "2.99"; }
 void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); }
 void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); }
 void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); }
@@ -184,88 +184,94 @@ void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0
 void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); }
 void cmovo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 0); }
 void jo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); }
-void seto(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 0); }
+void seto(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 0); }
 void cmovno(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 1); }
 void jno(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }
-void setno(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 1); }
+void setno(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 1); }
 void cmovb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
 void jb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
-void setb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); }
+void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
+void cmovc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } // same arguments as cmovb / cmovnae (condition value 2)
+void jc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); } // same opcodes as jb / jnae
+void setc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); } // same arguments as setb / setnae
 void cmovnae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
 void jnae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
-void setnae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); }
+void setnae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
 void cmovnb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
 void jnb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
-void setnb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); }
+void setnb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
 void cmovae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
 void jae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
-void setae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); }
+void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
+void cmovnc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } // same arguments as cmovnb / cmovae (condition value 3)
+void jnc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); } // same opcodes as jnb / jae
+void setnc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); } // same arguments as setnb / setae
 void cmove(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
 void je(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
-void sete(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); }
+void sete(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
 void cmovz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
 void jz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
-void setz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); }
+void setz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
 void cmovne(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
 void jne(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
-void setne(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); }
+void setne(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
 void cmovnz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
 void jnz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
-void setnz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); }
+void setnz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
 void cmovbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
 void jbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
-void setbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); }
+void setbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
 void cmovna(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
 void jna(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
-void setna(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); }
+void setna(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
 void cmovnbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
 void jnbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
-void setnbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); }
+void setnbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
 void cmova(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
 void ja(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
-void seta(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); }
+void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
 void cmovs(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 8); }
 void js(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }
-void sets(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 8); }
+void sets(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 8); }
 void cmovns(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 9); }
 void jns(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }
-void setns(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 9); }
+void setns(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 9); }
 void cmovp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
 void jp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
-void setp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); }
+void setp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
 void cmovpe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
 void jpe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
-void setpe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); }
+void setpe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
 void cmovnp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
 void jnp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
-void setnp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); }
+void setnp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
 void cmovpo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
 void jpo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
-void setpo(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); }
+void setpo(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
 void cmovl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
 void jl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
-void setl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); }
+void setl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
 void cmovnge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
 void jnge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
-void setnge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); }
+void setnge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
 void cmovnl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
 void jnl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
-void setnl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); }
+void setnl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
 void cmovge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
 void jge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
-void setge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); }
+void setge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
 void cmovle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
 void jle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
-void setle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); }
+void setle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
 void cmovng(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
 void jng(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
-void setng(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); }
+void setng(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
 void cmovnle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
 void jnle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
-void setnle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); }
+void setnle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
 void cmovg(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
 void jg(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
-void setg(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); }
+void setg(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
 #ifdef XBYAK64
 void cdqe() { db(0x48); db(0x98); }
 #else
@@ -308,12 +314,57 @@ void mwait() { db(0x0F); db(0x01); db(0xC9); }
 void rdmsr() { db(0x0F); db(0x32); }
 void rdpmc() { db(0x0F); db(0x33); }
 void rdtsc() { db(0x0F); db(0x31); }
+void rdtscp() { db(0x0F); db(0x01); db(0xF9); }
 void wait() { db(0x9B); }
 void wbinvd() { db(0x0F); db(0x09); }
 void wrmsr() { db(0x0F); db(0x30); }
 void xlatb() { db(0xD7); }
 void popf() { db(0x9D); }
 void pushf() { db(0x9C); }
+void vzeroall() { db(0xC5); db(0xFC); db(0x77); }
+void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
+void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
+void f2xm1() { db(0xD9); db(0xF0); }
+void fabs() { db(0xD9); db(0xE1); }
+void faddp() { db(0xDE); db(0xC1); }
+void fchs() { db(0xD9); db(0xE0); }
+void fcom() { db(0xD8); db(0xD1); }
+void fcomp() { db(0xD8); db(0xD9); }
+void fcompp() { db(0xDE); db(0xD9); }
+void fcos() { db(0xD9); db(0xFF); }
+void fdecstp() { db(0xD9); db(0xF6); }
+void fdivp() { db(0xDE); db(0xF9); }
+void fdivrp() { db(0xDE); db(0xF1); }
+void fincstp() { db(0xD9); db(0xF7); }
+void fld1() { db(0xD9); db(0xE8); }
+void fldl2t() { db(0xD9); db(0xE9); }
+void fldl2e() { db(0xD9); db(0xEA); }
+void fldpi() { db(0xD9); db(0xEB); }
+void fldlg2() { db(0xD9); db(0xEC); }
+void fldln2() { db(0xD9); db(0xED); }
+void fldz() { db(0xD9); db(0xEE); }
+void fmulp() { db(0xDE); db(0xC9); }
+void fnop() { db(0xD9); db(0xD0); }
+void fpatan() { db(0xD9); db(0xF3); }
+void fprem() { db(0xD9); db(0xF8); }
+void fprem1() { db(0xD9); db(0xF5); }
+void fptan() { db(0xD9); db(0xF2); }
+void frndint() { db(0xD9); db(0xFC); }
+void fscale() { db(0xD9); db(0xFD); }
+void fsin() { db(0xD9); db(0xFE); }
+void fsincos() { db(0xD9); db(0xFB); }
+void fsqrt() { db(0xD9); db(0xFA); }
+void fsubp() { db(0xDE); db(0xE9); }
+void fsubrp() { db(0xDE); db(0xE1); }
+void ftst() { db(0xD9); db(0xE4); }
+void fucom() { db(0xDD); db(0xE1); }
+void fucomp() { db(0xDD); db(0xE9); }
+void fucompp() { db(0xDA); db(0xE9); }
+void fxam() { db(0xD9); db(0xE5); }
+void fxch() { db(0xD9); db(0xC9); }
+void fxtract() { db(0xD9); db(0xF4); }
+void fyl2x() { db(0xD9); db(0xF1); }
+void fyl2xp1() { db(0xD9); db(0xF9); }
 void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
 void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
 void add(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x00); }
@@ -332,12 +383,12 @@ void xor(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); }
 void xor(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x30, 6); }
 void dec(const Operand& op) { opIncDec(op, 0x48, 1); }
 void inc(const Operand& op) { opIncDec(op, 0x40, 0); }
-void div(const Operand& op) { opR_ModM(op, 0, 3, 6, 0xF6); }
-void idiv(const Operand& op) { opR_ModM(op, 0, 3, 7, 0xF6); }
-void imul(const Operand& op) { opR_ModM(op, 0, 3, 5, 0xF6); }
-void mul(const Operand& op) { opR_ModM(op, 0, 3, 4, 0xF6); }
-void neg(const Operand& op) { opR_ModM(op, 0, 3, 3, 0xF6); }
-void not(const Operand& op) { opR_ModM(op, 0, 3, 2, 0xF6); }
+void div(const Operand& op) { opR_ModM(op, 0, 6, 0xF6); } // these six share opcode byte 0xF6 and differ only in the extension value (6, 7, 5, 4, 3, 2); the former third argument (3) is gone
+void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
+void imul(const Operand& op) { opR_ModM(op, 0, 5, 0xF6); }
+void mul(const Operand& op) { opR_ModM(op, 0, 4, 0xF6); }
+void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
+void not(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); } // NOTE(review): `not` is an alternative operator token in standard C++; this name presumably relies on a compiler mode that does not reserve it - confirm
 void rcl(const Operand& op, int imm) { opShift(op, imm, 2); }
 void rcl(const Operand& op, const Reg8& cl) { opShift(op, cl, 2); }
 void rcr(const Operand& op, int imm) { opShift(op, imm, 3); }
@@ -360,52 +411,57 @@ void shrd(const Operand& op, const Reg& reg, uint8 imm) { opShxd(op, reg, imm, 0
 void shrd(const Operand& op, const Reg& reg, const Reg8& cl) { opShxd(op, reg, 0, 0xAC, &cl); }
 void bsf(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); }
 void bsr(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); }
-void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, 256, 0x38); }
-void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, 256, 0x38); }
-void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, 256, 0x38); }
-void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, 256, 0x38); }
-void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, 256, 0x38); }
-void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, 256, 0x38); }
-void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, 256, 0x38); }
-void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, 256, 0x38); }
-void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, 256, 0x38); }
-void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, 256, 0x38); }
-void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, 256, 0x38); }
-void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, 256, 0x38); }
-void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, 256, 0x38); }
-void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, 256, 0x38); }
-void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, 256, 0x38); }
+void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); }
+void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, NONE, 0x38); }
+void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, NONE, 0x38); }
+void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, NONE, 0x38); }
+void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, NONE, 0x38); }
+void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, NONE, 0x38); }
+void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, NONE, 0x38); }
+void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, NONE, 0x38); }
+void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, NONE, 0x38); }
+void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, NONE, 0x38); }
+void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, NONE, 0x38); }
+void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, NONE, 0x38); }
+void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, NONE, 0x38); }
+void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, NONE, 0x38); }
+void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, NONE, 0x38); }
 void palignr(const Mmx& mmx, const Operand& op, int imm) { opMMX(mmx, op, 0x0f, 0x66, static_cast<uint8>(imm), 0x3a); }
-void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, 256, 0x38); }
-void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, 256, 0x38); }
+void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesdec(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDE, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesdeclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesenc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDC, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesenclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDD, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesimc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDB, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
 void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
 void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
 void dppd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
@@ -420,6 +476,8 @@ void pcmpestrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x60
 void pcmpestri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x61, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
 void pcmpistrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x62, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
 void pcmpistri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x63, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void aeskeygenassist(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
 void ldmxcsr(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0xAE); }
 void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); }
 void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
@@ -427,3 +485,540 @@ void movntpd(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getId
 void movntdq(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0xE7); }
 void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); }
 void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); }
+void fadd(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 0, 0); }
+void fiadd(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 0, 0); }
+void fcom(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 2, 0); }
+void fcomp(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 3, 0); }
+void fdiv(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 6, 0); }
+void fidiv(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 6, 0); }
+void fdivr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 7, 0); }
+void fidivr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 7, 0); }
+void ficom(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 2, 0); }
+void ficomp(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 3, 0); }
+void fild(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 0, 5); }
+void fist(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0x00, 2, 0); }
+void fistp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 3, 7); }
+void fisttp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDD, 1, 0); }
+void fld(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 0, 0); }
+void fmul(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 1, 0); }
+void fimul(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 1, 0); }
+void fst(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 2, 0); }
+void fstp(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 3, 0); }
+void fsub(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 4, 0); }
+void fisub(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 4, 0); }
+void fsubr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 5, 0); }
+void fisubr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 5, 0); }
+void fadd(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C0, 0xDCC0); }
+void faddp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC0); }
+void fcmovb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC0, 0x00C0); }
+void fcmove(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC8, 0x00C8); }
+void fcmovbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD0, 0x00D0); }
+void fcmovu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD8, 0x00D8); }
+void fcmovnb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC0, 0x00C0); }
+void fcmovne(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC8, 0x00C8); }
+void fcmovnbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD0, 0x00D0); }
+void fcmovnu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD8, 0x00D8); }
+void fcomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBF0, 0x00F0); }
+void fcomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFF0, 0x00F0); }
+void fucomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBE8, 0x00E8); }
+void fucomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFE8, 0x00E8); }
+void fdiv(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F0, 0xDCF8); }
+void fdivp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF8); }
+void fdivr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F8, 0xDCF0); }
+void fdivrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF0); }
+void fmul(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C8, 0xDCC8); }
+void fmulp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC8); }
+void fsub(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E0, 0xDCE8); }
+void fsubp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE8); }
+void fsubr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E8, 0xDCE0); }
+void fsubrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE0); }
+void fcom(const Fpu& reg) { opFpu(reg, 0xD8, 0xD0); }
+void fcomp(const Fpu& reg) { opFpu(reg, 0xD8, 0xD8); }
+void ffree(const Fpu& reg) { opFpu(reg, 0xDD, 0xC0); }
+void fld(const Fpu& reg) { opFpu(reg, 0xD9, 0xC0); }
+void fst(const Fpu& reg) { opFpu(reg, 0xDD, 0xD0); }
+void fstp(const Fpu& reg) { opFpu(reg, 0xDD, 0xD8); }
+void fucom(const Fpu& reg) { opFpu(reg, 0xDD, 0xE0); }
+void fucomp(const Fpu& reg) { opFpu(reg, 0xDD, 0xE8); }
+void fxch(const Fpu& reg) { opFpu(reg, 0xD9, 0xC8); }
+void vaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x58, true); }
+void vaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x58, true); }
+void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x58, false); }
+void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x58, false); }
+void vsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5C, true); }
+void vsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5C, true); }
+void vsubsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5C, false); }
+void vsubss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5C, false); }
+void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x59, true); }
+void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x59, true); }
+void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x59, false); }
+void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x59, false); }
+void vdivpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5E, true); }
+void vdivps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5E, true); }
+void vdivsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5E, false); }
+void vdivss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5E, false); }
+void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5F, true); }
+void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5F, true); }
+void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5F, false); }
+void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5F, false); }
+void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5D, true); }
+void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5D, true); }
+void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5D, false); }
+void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5D, false); }
+void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x54, true); }
+void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x54, true); }
+void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x55, true); }
+void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x55, true); }
+void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x56, true); }
+void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x56, true); }
+void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x57, true); }
+void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x57, true); }
+void vblendpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); }
+void vblendpd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); }
+void vblendps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
+void vblendps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
+void vdppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); }
+void vdppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); }
+void vdpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
+void vdpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
+void vmpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x42, false, 0); db(imm); }
+void vmpsadbw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x42, false, 0); db(imm); }
+void vpblendw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0E, false, 0); db(imm); }
+void vpblendw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0E, false, 0); db(imm); }
+void vroundsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); }
+void vroundsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); }
+void vroundss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
+void vroundss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
+void vpclmulqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
+void vpclmulqdq(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
+void vpermilps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0C, true, 0); }
+void vpermilpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0D, true, 0); }
+void vcmppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); }
+void vcmppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); }
+void vcmpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC2, true, -1); db(imm); }
+void vcmpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0xC2, true, -1); db(imm); }
+void vcmpsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
+void vcmpsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
+void vcmpss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
+void vcmpss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
+void vcvtsd2ss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x5A, false, -1); }
+void vcvtsd2ss(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0x5A, false, -1); }
+void vcvtss2sd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x5A, false, -1); }
+void vcvtss2sd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0x5A, false, -1); }
+void vinsertps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x21, false, 0); db(imm); }
+void vinsertps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x21, false, 0); db(imm); }
+void vpacksswb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x63, false, -1); }
+void vpacksswb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x63, false, -1); }
+void vpackssdw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6B, false, -1); }
+void vpackssdw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6B, false, -1); }
+void vpackuswb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x67, false, -1); }
+void vpackuswb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x67, false, -1); }
+void vpackusdw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x2B, false, -1); }
+void vpackusdw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x2B, false, -1); }
+void vpaddb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFC, false, -1); }
+void vpaddb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xFC, false, -1); }
+void vpaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFD, false, -1); }
+void vpaddw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xFD, false, -1); }
+void vpaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFE, false, -1); }
+void vpaddd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xFE, false, -1); }
+void vpaddq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD4, false, -1); }
+void vpaddq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD4, false, -1); }
+void vpaddsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEC, false, -1); }
+void vpaddsb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xEC, false, -1); }
+void vpaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xED, false, -1); }
+void vpaddsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xED, false, -1); }
+void vpaddusb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDC, false, -1); }
+void vpaddusb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDC, false, -1); }
+void vpaddusw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDD, false, -1); }
+void vpaddusw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDD, false, -1); }
+void vpalignr(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0F, false, -1); db(imm); }
+void vpalignr(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0F, false, -1); db(imm); }
+void vpand(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDB, false, -1); }
+void vpand(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDB, false, -1); }
+void vpandn(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDF, false, -1); }
+void vpandn(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDF, false, -1); }
+void vpavgb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE0, false, -1); }
+void vpavgb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE0, false, -1); }
+void vpavgw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE3, false, -1); }
+void vpavgw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE3, false, -1); }
+void vpcmpeqb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x74, false, -1); }
+void vpcmpeqb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x74, false, -1); }
+void vpcmpeqw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x75, false, -1); }
+void vpcmpeqw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x75, false, -1); }
+void vpcmpeqd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x76, false, -1); }
+void vpcmpeqd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x76, false, -1); }
+void vpcmpeqq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x29, false, -1); }
+void vpcmpeqq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x29, false, -1); }
+void vpcmpgtb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x64, false, -1); }
+void vpcmpgtb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x64, false, -1); }
+void vpcmpgtw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x65, false, -1); }
+void vpcmpgtw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x65, false, -1); }
+void vpcmpgtd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x66, false, -1); }
+void vpcmpgtd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x66, false, -1); }
+void vpcmpgtq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x37, false, -1); }
+void vpcmpgtq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x37, false, -1); }
+void vphaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x01, false, -1); }
+void vphaddw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x01, false, -1); }
+void vphaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x02, false, -1); }
+void vphaddd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x02, false, -1); }
+void vphaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x03, false, -1); }
+void vphaddsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x03, false, -1); }
+void vphsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x05, false, -1); }
+void vphsubw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x05, false, -1); }
+void vphsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x06, false, -1); }
+void vphsubd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x06, false, -1); }
+void vphsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x07, false, -1); }
+void vphsubsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x07, false, -1); }
+void vpmaddwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF5, false, -1); }
+void vpmaddwd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF5, false, -1); }
+void vpmaddubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x04, false, -1); }
+void vpmaddubsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x04, false, -1); }
+void vpmaxsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3C, false, -1); }
+void vpmaxsb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3C, false, -1); }
+void vpmaxsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEE, false, -1); }
+void vpmaxsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xEE, false, -1); }
+void vpmaxsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3D, false, -1); }
+void vpmaxsd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3D, false, -1); }
+void vpmaxub(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDE, false, -1); }
+void vpmaxub(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDE, false, -1); }
+void vpmaxuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3E, false, -1); }
+void vpmaxuw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3E, false, -1); }
+void vpmaxud(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3F, false, -1); }
+void vpmaxud(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3F, false, -1); }
+void vpminsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x38, false, -1); }
+void vpminsb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x38, false, -1); }
+void vpminsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEA, false, -1); }
+void vpminsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xEA, false, -1); }
+void vpminsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x39, false, -1); }
+void vpminsd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x39, false, -1); }
+void vpminub(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDA, false, -1); }
+void vpminub(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xDA, false, -1); }
+void vpminuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3A, false, -1); }
+void vpminuw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3A, false, -1); }
+void vpminud(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3B, false, -1); }
+void vpminud(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x3B, false, -1); }
+void vpmulhuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE4, false, -1); }
+void vpmulhuw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE4, false, -1); }
+void vpmulhrsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0B, false, -1); }
+void vpmulhrsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0B, false, -1); }
+void vpmulhw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE5, false, -1); }
+void vpmulhw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE5, false, -1); }
+void vpmullw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD5, false, -1); }
+void vpmullw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD5, false, -1); }
+void vpmulld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x40, false, -1); }
+void vpmulld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x40, false, -1); }
+void vpmuludq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF4, false, -1); }
+void vpmuludq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF4, false, -1); }
+void vpmuldq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x28, false, -1); }
+void vpmuldq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x28, false, -1); }
+void vpor(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEB, false, -1); } // 3-operand form, opcode byte 0xEB
+void vpor(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xEB, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF6, false, -1); } // 3-operand form, opcode byte 0xF6
+void vpsadbw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF6, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpshufb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x00, false, -1); } // NOTE(review): only the 3-operand form appears in this span, while the neighbouring mnemonics also get a 2-operand shorthand - confirm the omission is intentional
+void vpsignb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x08, false, -1); } // vpsignb/w/d use consecutive opcode bytes 0x08, 0x09, 0x0A with MM_0F38 | PP_66
+void vpsignb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x08, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x09, false, -1); }
+void vpsignw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x09, false, -1); }
+void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, false, -1); }
+void vpsignd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0A, false, -1); }
+void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); } // overloads whose last parameter is an Operand; the same mnemonics are also overloaded on a uint8 immediate elsewhere in this file
+void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); }
+void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, false, -1); }
+void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); }
+void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, false, -1); }
+void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); }
+void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, false, -1); }
+void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); }
+void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, false, -1); }
+void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); }
+void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, false, -1); }
+void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); }
+void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, false, -1); }
+void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); }
+void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, false, -1); }
+void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, false, -1); } // vpsub* entries: all use MM_0F | PP_66 and differ only in the opcode byte
+void vpsubb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF8, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, false, -1); }
+void vpsubw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF9, false, -1); }
+void vpsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFA, false, -1); }
+void vpsubd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xFA, false, -1); }
+void vpsubq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFB, false, -1); }
+void vpsubq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xFB, false, -1); }
+void vpsubsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE8, false, -1); }
+void vpsubsb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE8, false, -1); }
+void vpsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE9, false, -1); }
+void vpsubsw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE9, false, -1); }
+void vpsubusb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD8, false, -1); }
+void vpsubusb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD8, false, -1); }
+void vpsubusw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD9, false, -1); }
+void vpsubusw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD9, false, -1); }
+void vpunpckhbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x68, false, -1); } // vpunpckh* entries use opcode bytes 0x68, 0x69, 0x6A, 0x6D
+void vpunpckhbw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x68, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vpunpckhwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x69, false, -1); }
+void vpunpckhwd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x69, false, -1); }
+void vpunpckhdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6A, false, -1); }
+void vpunpckhdq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6A, false, -1); }
+void vpunpckhqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6D, false, -1); }
+void vpunpckhqdq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6D, false, -1); }
+void vpunpcklbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x60, false, -1); } // vpunpckl* entries use opcode bytes 0x60, 0x61, 0x62, 0x6C
+void vpunpcklbw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x60, false, -1); }
+void vpunpcklwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x61, false, -1); }
+void vpunpcklwd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x61, false, -1); }
+void vpunpckldq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x62, false, -1); }
+void vpunpckldq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x62, false, -1); }
+void vpunpcklqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6C, false, -1); }
+void vpunpcklqdq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6C, false, -1); }
+void vpxor(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEF, false, -1); } // 3-operand form, opcode byte 0xEF
+void vpxor(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xEF, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vrcpss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x53, false, -1); } // uses the PP_F3 prefix flag with opcode byte 0x53
+void vrcpss(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0x53, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vrsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x52, false, -1); } // uses the PP_F3 prefix flag with opcode byte 0x52
+void vrsqrtss(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0x52, false, -1); }
+void vshufpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC6, true, -1); db(imm); } // the immediate byte is appended with db(imm) after the helper call
+void vshufpd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xC6, true, -1); db(imm); } // 2-operand shorthand: xmm is passed twice
+void vshufps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC6, true, -1); db(imm); } // same opcode byte as vshufpd but with no PP_* prefix flag
+void vshufps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0xC6, true, -1); db(imm); }
+void vsqrtsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x51, false, -1); } // sd variant uses PP_F2; the ss variant below uses PP_F3 with the same opcode byte 0x51
+void vsqrtsd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0x51, false, -1); } // 2-operand shorthand: xmm is passed twice
+void vsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x51, false, -1); }
+void vsqrtss(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0x51, false, -1); }
+void vunpckhpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x15, true, -1); } // pd variants add PP_66; ps variants pass MM_0F alone; high forms use 0x15, low forms 0x14
+void vunpckhpd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x15, true, -1); } // 2-operand shorthand: xmm is passed twice
+void vunpckhps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x15, true, -1); }
+void vunpckhps(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0x15, true, -1); }
+void vunpcklpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x14, true, -1); }
+void vunpcklpd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x14, true, -1); }
+void vunpcklps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x14, true, -1); }
+void vunpcklps(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0x14, true, -1); }
+void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0xDF, false, 0, imm); } // (xm, op, imm) forms: imm is handed to opAVX_X_XM_IMM as its last argument rather than emitted with db() here
+void vroundpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x09, true, 0, imm); }
+void vroundps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x08, true, 0, imm); }
+void vpermilpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x05, true, 0, imm); }
+void vpermilps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x04, true, 0, imm); }
+void vpcmpestri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x61, false, 0, imm); } // the four vpcmp?str? entries use opcode bytes 0x60-0x63
+void vpcmpestrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x60, false, 0, imm); }
+void vpcmpistri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x63, false, 0, imm); }
+void vpcmpistrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x62, false, 0, imm); }
+void vtestps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0E, true, 0); } // no immediate is passed; the last explicit argument is 0 rather than the -1 used by most two-operand entries
+void vtestpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0F, true, 0); } // same as vtestps with opcode byte 0x0F
+void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2F, false, -1); } // two-operand entries routed through opAVX_X_XM_IMM with no immediate argument
+void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x2F, false, -1); }
+void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x5B, true, -1); } // the three 0x5B conversions differ only in prefix flag: none, PP_66, PP_F3
+void vcvtps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x5B, true, -1); }
+void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x5B, true, -1); }
+void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x28, true, -1); } // (Xmm, Operand) overloads of the moves; the (Address, Xmm) overloads are separate functions with different opcode bytes
+void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x28, true, -1); }
+void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x12, true, -1); }
+void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x6F, true, -1); }
+void vmovdqu(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x6F, true, -1); }
+void vmovshdup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x16, true, -1); }
+void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x12, true, -1); }
+void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x10, true, -1); }
+void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x10, true, -1); }
+void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1C, false, -1); } // vpabsb/w/d use consecutive opcode bytes 0x1C, 0x1D, 0x1E
+void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1D, false, -1); }
+void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1E, false, -1); }
+void vphminposuw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x41, false, -1); }
+void vpmovsxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x20, false, -1); } // vpmovsx* entries use opcode bytes 0x20-0x25
+void vpmovsxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x21, false, -1); }
+void vpmovsxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x22, false, -1); }
+void vpmovsxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x23, false, -1); }
+void vpmovsxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x24, false, -1); }
+void vpmovsxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x25, false, -1); }
+void vpmovzxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x30, false, -1); } // vpmovzx* entries use opcode bytes 0x30-0x35
+void vpmovzxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x31, false, -1); }
+void vpmovzxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x32, false, -1); }
+void vpmovzxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x33, false, -1); }
+void vpmovzxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x34, false, -1); }
+void vpmovzxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x35, false, -1); }
+void vpshufd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x70, false, -1, imm); } // all three share opcode byte 0x70 and differ in prefix flag: PP_66, PP_F3, PP_F2
+void vpshufhw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x70, false, -1, imm); }
+void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x70, false, -1, imm); }
+void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, true, -1); } // fifth argument is true, as for vtestps/vtestpd: AVX defines a 256-bit form (VEX.256.66.0F38.WIG 17 /r, vptest ymm1, ymm2/m256)
+void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x53, true, -1); } // two-operand entries with no immediate; ps forms pass MM_0F alone, pd/sd forms add PP_66
+void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x52, true, -1); }
+void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x51, true, -1); }
+void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x51, true, -1); }
+void vucomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2E, false, -1); }
+void vucomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x2E, false, -1); }
+void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x29, true, -1); } // (Address, Xmm) overloads: the helper still receives the register first and the address second
+void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x29, true, -1); }
+void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x7F, true, -1); }
+void vmovdqu(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_F3, 0x7F, true, -1); }
+void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x11, true, -1); }
+void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x11, true, -1); }
+void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0xD0, true, -1); } // op2 defaults to an empty Operand, so one function covers both 2- and 3-argument calls; pd forms use PP_66, ps forms PP_F2
+void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0xD0, true, -1); }
+void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7C, true, -1); }
+void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7C, true, -1); }
+void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7D, true, -1); }
+void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7D, true, -1); }
+void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDC, false, 0); } // the four AES entries use consecutive opcode bytes 0xDC-0xDF and pass (false, 0) as the trailing arguments
+void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDD, false, 0); }
+void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDE, false, 0); }
+void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDF, false, 0); }
+void vmaskmovps(const Xmm& xm1, const Xmm& xm2, const Address& addr) { opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2C, true, 0); } // (Xmm, Xmm, Address) overload: registers are forwarded in the order given, opcode byte 0x2C
+void vmaskmovps(const Address& addr, const Xmm& xm1, const Xmm& xm2) { opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2E, true, 0); } // (Address, Xmm, Xmm) overload: xm2 is forwarded before xm1, opcode byte 0x2E
+void vmaskmovpd(const Xmm& xm1, const Xmm& xm2, const Address& addr) { opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2D, true, 0); } // as vmaskmovps, opcode byte 0x2D
+void vmaskmovpd(const Address& addr, const Xmm& xm1, const Xmm& xm2) { opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2F, true, 0); } // as vmaskmovps, opcode byte 0x2F
+void vmovhpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x16, false); } // throws ERR_BAD_COMBINATION when op2 is supplied but is not a memory operand
+void vmovhpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x17, false); } // (Address, Xmm) overload: xm0 is passed as the second register argument
+void vmovhps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, op2, MM_0F, 0x16, false); } // same check as vmovhpd; no PP_* prefix flag
+void vmovhps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x17, false); }
+void vmovlpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x12, false); } // low-half counterparts use opcode bytes 0x12 / 0x13
+void vmovlpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x13, false); }
+void vmovlps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, op2, MM_0F, 0x12, false); }
+void vmovlps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x13, false); }
+void vfmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 1); } // pd/sd variants pass 1 as the last argument, ps/ss variants pass 0 (presumably the VEX.W selector - confirm against opAVX_X_X_XM)
+void vfmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 1); } // the 132 / 213 / 231 orderings use opcode bytes 0x9_, 0xA_, 0xB_ respectively
+void vfmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 1); }
+void vfmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 0); }
+void vfmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 0); }
+void vfmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 0); }
+void vfmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 1); } // scalar variants pass false as the sixth argument and use the next opcode byte (0x99 vs 0x98)
+void vfmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 1); }
+void vfmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 1); }
+void vfmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 0); }
+void vfmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 0); }
+void vfmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 0); }
+void vfmaddsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 1); } // packed-only family: opcode bytes 0x96 / 0xA6 / 0xB6; pd passes 1 and ps passes 0 as the last argument
+void vfmaddsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 1); }
+void vfmaddsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 1); }
+void vfmaddsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 0); }
+void vfmaddsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 0); }
+void vfmaddsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 0); }
+void vfmsubadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 1); } // packed-only family: opcode bytes 0x97 / 0xA7 / 0xB7; pd passes 1 and ps passes 0 as the last argument
+void vfmsubadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 1); }
+void vfmsubadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 1); }
+void vfmsubadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 0); }
+void vfmsubadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 0); }
+void vfmsubadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 0); }
+void vfmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 1); } // packed forms use opcode bytes 0x9A / 0xAA / 0xBA; pd passes 1 and ps passes 0 as the last argument
+void vfmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 1); }
+void vfmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 1); }
+void vfmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 0); }
+void vfmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 0); }
+void vfmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 0); }
+void vfmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 1); } // scalar forms use opcode bytes 0x9B / 0xAB / 0xBB and pass false as the sixth argument
+void vfmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 1); }
+void vfmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 1); }
+void vfmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 0); }
+void vfmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 0); }
+void vfmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 0); }
+void vfnmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 1); } // packed forms use opcode bytes 0x9C / 0xAC / 0xBC; pd passes 1 and ps passes 0 as the last argument
+void vfnmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 1); }
+void vfnmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 1); }
+void vfnmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 0); }
+void vfnmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 0); }
+void vfnmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 0); }
+void vfnmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 1); } // scalar forms use opcode bytes 0x9D / 0xAD / 0xBD and pass false as the sixth argument
+void vfnmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 1); }
+void vfnmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 1); }
+void vfnmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 0); }
+void vfnmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 0); }
+void vfnmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 0); }
+void vfnmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 1); } // packed forms use opcode bytes 0x9E / 0xAE / 0xBE; pd passes 1 and ps passes 0 as the last argument
+void vfnmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 1); }
+void vfnmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 1); }
+void vfnmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 0); }
+void vfnmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 0); }
+void vfnmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 0); }
+void vfnmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 1); } // scalar forms use opcode bytes 0x9F / 0xAF / 0xBF and pass false as the sixth argument
+void vfnmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 1); }
+void vfnmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 1); }
+void vfnmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 0); }
+void vfnmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 0); }
+void vfnmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 0); }
+void vaesimc(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0xDB, false, 0); } // two-operand entry with no immediate: opcode byte 0xDB, trailing arguments (false, 0)
+void vbroadcastf128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x1A, true, 0); } // destination is typed Ymm and the source must be an Address
+void vbroadcastsd(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x19, true, 0); } // destination is typed Ymm and the source must be an Address
+void vbroadcastss(const Xmm& x, const Address& addr) { opAVX_X_XM_IMM(x, addr, MM_0F38 | PP_66, 0x18, true, 0); } // destination is typed Xmm here, unlike the two entries above
+void vextractf128(const Operand& op, const Ymm& y, uint8 imm) { opAVX_X_XM_IMM(y, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x19, true, 0, imm); } // op is passed through cvtReg(op, op.isXMM(), Operand::YMM) before reaching the helper; y is forwarded first
+void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x18, true, 0); db(imm); } // same cvtReg conversion of op; imm is appended with db(imm)
+void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, MM_0F3A | PP_66, 0x06, true, 0); db(imm); } // op is forwarded unchanged; imm is appended with db(imm)
+void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_F2, 0xF0, true, 0); } // the second register argument is a placeholder, xm0 or ym0, chosen by x.isXMM()
+void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, MM_0F, 0xAE, false, -1); } // fixed xm2 in the first slot (presumably supplies the /2 opcode extension in ModRM.reg - confirm against opAVX_X_X_XM)
+void vstmxcsr(const Address& addr) { opAVX_X_X_XM(xm3, xm0, addr, MM_0F, 0xAE, false, -1); } // same opcode byte 0xAE with fixed xm3 in the first slot (presumably /3)
+void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_66, 0xF7, false, -1); } // register-only form: x1 goes in the first slot, xm0 fills the middle slot, x2 goes in the operand slot
+void vpextrb(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(i32e) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x14, false); db(imm); } // throws ERR_BAD_COMBINATION unless op is an i32e register or memory; a register op is re-typed via cvtReg before encoding; imm is appended with db(imm)
+void vpextrw(const Reg& r, const Xmm& x, uint8 imm) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xC5, false); db(imm); } // opcode 0F C5 is an RM encoding: the destination GPR index goes in the first (ModRM.reg) slot and the source xmm in the operand (r/m) slot, the same layout vpmovmskb uses; imm is appended with db(imm)
+void vpextrw(const Address& addr, const Xmm& x, uint8 imm) { opAVX_X_X_XM(x, xm0, addr, MM_0F3A | PP_66, 0x15, false); db(imm); } // memory-destination overload: uses MM_0F3A with opcode byte 0x15, unlike the register overload's MM_0F / 0xC5
+void vpextrd(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x16, false, 0); db(imm); } // throws ERR_BAD_COMBINATION unless op is a 32-bit register or memory; passes an explicit 0 as the last helper argument
+void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false); db(imm); } // all vpinsr* overloads throw ERR_BAD_COMBINATION unless op is a 32-bit register or memory, re-type a register op via cvtReg, then append imm with db(imm)
+void vpinsrb(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false); db(imm); } // shorthand: x is passed as both register arguments
+void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false); db(imm); } // uses MM_0F with opcode byte 0xC4 rather than MM_0F3A
+void vpinsrw(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false); db(imm); }
+void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0); db(imm); } // passes an explicit 0 as the last helper argument, unlike vpinsrb/vpinsrw
+void vpinsrd(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0); db(imm); }
+void vpmovmskb(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xD7, false); } // the destination GPR is re-wrapped as Xmm(r.getIdx()) for the first slot; xm0 fills the middle slot and x goes in the operand slot
+void vpslldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm7, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); } // fixed xm7 in the first slot (presumably the /7 opcode extension - confirm); x1 and x2 follow, then imm is appended with db(imm)
+void vpslldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm7, x, x, MM_0F | PP_66, 0x73, false); db(imm); } // shorthand: x is passed for both remaining slots
+void vpsrldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm3, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); } // same opcode byte 0x73 with fixed xm3 in the first slot (presumably /3)
+void vpsrldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm3, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
+void vpsllw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpsllw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpslld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpslld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpsllq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
+void vpsllq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
+void vpsraw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpsraw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpsrad(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpsrad(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpsrlw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpsrlw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
+void vpsrld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpsrld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
+void vpsrlq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
+void vpsrlq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
+void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); } // the fourth register x4 is encoded in the high nibble of the trailing byte
+void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); } // same encoding as vblendvpd(x1, x1, op, x4)
+void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); } // as vblendvpd but opcode 0x4A
+void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); } // same encoding as vblendvps(x1, x1, op, x4)
+void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); } // opcode 0x4C; unlike the two float blends above, the flag argument is false here
+void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); } // same encoding as vpblendvb(x1, x1, op, x4)
+void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); } // opcode 0x6E; reg is handed to the encoder as an Xmm with the same index
+void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); } // opcode 0x6E with a memory operand
+void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); } // opcode 0x7E; encoder arguments are the same as the 0x6E register form
+void vmovd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x7E, false, 0); } // opcode 0x7E with a memory operand
+void vmovhlps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F, 0x12, false); } // op is optional; if given it must be an XMM register, else ERR_BAD_COMBINATION is thrown
+void vmovlhps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F, 0x16, false); } // same operand rules as vmovhlps, opcode 0x16
+void vmovmskpd(const Reg& r, const Xmm& x) { if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), x.isXMM() ? xm0 : ym0, x, MM_0F | PP_66, 0x50, true, 0); } // throws unless r.isBit(i32e); r's index is wrapped as Xmm or Ymm to match x
+void vmovmskps(const Reg& r, const Xmm& x) { if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), x.isXMM() ? xm0 : ym0, x, MM_0F, 0x50, true, 0); } // as vmovmskpd but without the PP_66 prefix
+void vmovntdq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0xE7, true); } // second encoder argument is xm0 or ym0, chosen to match x
+void vmovntpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0x2B, true); } // opcode 0x2B with PP_66
+void vmovntps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F, 0x2B, true); } // opcode 0x2B without a PP prefix
+void vmovntdqa(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F38 | PP_66, 0x2A, false); } // opcode 0x2A with MM_0F38 | PP_66; second argument is always xm0
+void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x10, false); } // register form; op is optional and must be an XMM register when given
+void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x10, false); } // memory-source form, opcode 0x10
+void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x11, false); } // memory-destination form, opcode 0x11
+void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x10, false); } // register form; op is optional and must be an XMM register when given
+void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x10, false); } // memory-source form, opcode 0x10
+void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x11, false); } // memory-destination form, opcode 0x11
+void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 0); } // 32-bit destination: last argument is 0 (the Reg64 overload passes 1)
+void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 0); } // opcode 0x2C, otherwise as vcvtss2si
+void vcvtsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 0); } // PP_F2 variant of vcvtss2si
+void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 0); } // PP_F2 variant of vcvttss2si
+void vcvtsi2ss(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F3, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); } // op2 is optional; if given it must satisfy isREG(i32e) or be memory. Last argument: -1 if either operand is memory, else 0 if either is a 32-bit register, else 1
+void vcvtsi2sd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F2, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); } // PP_F2 variant of vcvtsi2ss with the same operand rules
+void vcvtps2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, cvtReg(op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM), MM_0F, 0x5A, true); } // op must be memory or an XMM register; a register op goes through cvtReg with the kind (XMM/YMM) of x
+void vcvtdq2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, cvtReg(op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM), MM_0F | PP_F3, 0xE6, true); } // same operand rules as vcvtps2pd; opcode 0xE6 with PP_F3
+void vcvtpd2ps(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0x5A, true); } // x must not be YMM; when op is YMM, x's index is re-wrapped as Ymm for the encoder
+void vcvtpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_F2, 0xE6, true); } // same operand rules as vcvtpd2ps; opcode 0xE6 with PP_F2
+void vcvttpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0xE6, true); } // same operand rules as vcvtpd2ps; opcode 0xE6 with PP_66
+#ifdef XBYAK64 // everything up to the matching #endif exists only in 64-bit builds
+void vmovq(const Xmm& x, const Reg64& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 1); } // same opcode 0x6E as vmovd, but the last argument is 1 instead of 0
+void vmovq(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x7E, false, -1); } // memory-source form: opcode 0x7E with PP_F3
+void vmovq(const Reg64& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 1); } // same opcode 0x7E as vmovd(reg, x), but the last argument is 1
+void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0xD6, false, -1); } // memory-destination form: opcode 0xD6 with PP_66
+void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_F3, 0x7E, false, -1); } // register-to-register form: opcode 0x7E with PP_F3
+void vpextrq(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x16, false, 1); db(imm); } // as vpextrd, but op must be a 64-bit register or memory and the last argument is 1
+void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1); db(imm); } // as vpinsrd, but op must be a 64-bit register or memory and the last argument is 1
+void vpinsrq(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1); db(imm); } // same encoding as vpinsrq(x, x, op, imm)
+void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 1); } // Reg64 overload: identical to the Reg32 one except the last argument is 1
+void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); } // Reg64 overload: last argument 1
+void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); } // Reg64 overload: last argument 1
+void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); } // Reg64 overload: last argument 1
+#endif // XBYAK64
diff --git a/plugins/GSdx/xbyak/xbyak_util.h b/plugins/GSdx/xbyak/xbyak_util.h
index ba091687d0..18aacb34ce 100644
--- a/plugins/GSdx/xbyak/xbyak_util.h
+++ b/plugins/GSdx/xbyak/xbyak_util.h
@@ -2,9 +2,10 @@
 #define XBYAK_XBYAK_UTIL_H_
 
 /**
-	utility class for Xbyak
-	@note this header is under construction
+	utility class and functions for Xbyak
+	@note this header is UNDER CONSTRUCTION!
 */
+#include "xbyak/xbyak.h"
 
 #ifdef _WIN32
 	#if (_MSC_VER < 1400) && defined(XBYAK32)
@@ -29,10 +30,17 @@
 		#include <intrin.h> // for __cpuid
 	#endif
 #else
-	#if __GNUC_PREREQ(4, 3)
+	#ifndef __GNUC_PREREQ // fallback for toolchains whose headers do not provide it; must be a real version test, not a constant
+    	#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
+	#endif
+	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
 		#include <cpuid.h>
 	#else
-		#define __cpuid(eaxIn, a, b, c, d) __asm__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
+			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn)) // ebx is saved/restored by hand, so cpuid's ebx result is returned through esi ("=S")
+		#else
+			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+		#endif
 	#endif
 #endif
 
@@ -43,6 +51,10 @@ namespace Xbyak { namespace util {
 */
 class Cpu {
 	unsigned int type_;
+	unsigned int get32bitAsBE(const char *x) const // packs x[0..3] into one word with x[0] as the LOW byte (little-endian order, despite the "BE" in the name)
+	{
+		return (unsigned char)x[0] | ((unsigned char)x[1] << 8) | ((unsigned char)x[2] << 16) | ((unsigned int)(unsigned char)x[3] << 24); // go through unsigned char so bytes >= 0x80 are not sign-extended
+	}
 public:
 	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
 	{
@@ -64,11 +76,17 @@ public:
 		tSSE41 = 1 << 7,
 		tSSE42 = 1 << 8,
 		tPOPCNT = 1 << 9,
+		tAESNI = 1 << 10,
+		tSSE5 = 1 << 11,
+		tOSXSACE = 1 << 12,
+		tPCLMULQDQ = 1 << 13,
+		tAVX = 1 << 14,
+		tFMA = 1 << 15,
 
 		t3DN = 1 << 16,
 		tE3DN = 1 << 17,
 		tSSE4a = 1 << 18,
-		tSSE5 = 1 << 11,
+		tRDTSCP = 1 << 19,
 
 		tINTEL = 1 << 24,
 		tAMD = 1 << 25
@@ -80,28 +98,39 @@ public:
 		getCpuid(0, data);
 		static const char intel[] = "ntel";
 		static const char amd[] = "cAMD";
-		if (data[2] == *reinterpret_cast<const unsigned int*>(amd)) {
+		if (data[2] == get32bitAsBE(amd)) {
 			type_ |= tAMD;
 			getCpuid(0x80000001, data);
-			if (data[3] & (1 << 31)) type_ |= t3DN;
-			if (data[3] & (1 << 15)) type_ |= tCMOV;
-			if (data[3] & (1 << 30)) type_ |= tE3DN;
-			if (data[3] & (1 << 22)) type_ |= tMMX2;
+			if (data[3] & (1U << 31)) type_ |= t3DN;
+			if (data[3] & (1U << 15)) type_ |= tCMOV;
+			if (data[3] & (1U << 30)) type_ |= tE3DN;
+			if (data[3] & (1U << 22)) type_ |= tMMX2;
+			if (data[3] & (1U << 27)) type_ |= tRDTSCP;
 		}
-		if (data[2] == *reinterpret_cast<const unsigned int*>(intel)) {
+		if (data[2] == get32bitAsBE(intel)) {
 			type_ |= tINTEL;
+			getCpuid(0x80000001, data);
+			if (data[3] & (1U << 27)) type_ |= tRDTSCP;
 		}
 		getCpuid(1, data);
-		if (data[2] & (1 << 0)) type_ |= tSSE3;
-		if (data[2] & (1 << 9)) type_ |= tSSSE3;
-		if (data[2] & (1 << 19)) type_ |= tSSE41;
-		if (data[2] & (1 << 20)) type_ |= tSSE42;
-		if (data[2] & (1 << 23)) type_ |= tPOPCNT;
-
-		if (data[3] & (1 << 15)) type_ |= tCMOV;
-		if (data[3] & (1 << 23)) type_ |= tMMX;
-		if (data[3] & (1 << 25)) type_ |= tMMX2 | tSSE;
-		if (data[3] & (1 << 26)) type_ |= tSSE2;
+		if (data[2] & (1U << 0)) type_ |= tSSE3;
+		if (data[2] & (1U << 9)) type_ |= tSSSE3;
+		if (data[2] & (1U << 19)) type_ |= tSSE41;
+		if (data[2] & (1U << 20)) type_ |= tSSE42;
+		if (data[2] & (1U << 23)) type_ |= tPOPCNT;
+		if (data[2] & (1U << 25)) type_ |= tAESNI;
+		if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
+		if (data[2] & (1U << 27)) type_ |= tOSXSACE;
+#if _M_SSE >= 0x500
+		// QQQ
+		// should check XFEATURE_ENABLED_MASK[2:1] = '11b' by xgetbv
+		if (data[2] & (1U << 28)) type_ |= tAVX;
+		if (data[2] & (1U << 12)) type_ |= tFMA;
+#endif
+		if (data[3] & (1U << 15)) type_ |= tCMOV;
+		if (data[3] & (1U << 23)) type_ |= tMMX;
+		if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
+		if (data[3] & (1U << 26)) type_ |= tSSE2;
 	}
 	bool has(Type type) const
 	{
@@ -109,6 +138,40 @@ public:
 	}
 };
 
+/** Accumulates time-stamp-counter ticks over begin()/end() pairs. */
+class Clock {
+public:
+	/** Read the time-stamp counter (rdtsc) as a single 64-bit value. */
+	static inline uint64 getRdtsc()
+	{
+#ifdef _MSC_VER
+		return __rdtsc();
+#else
+		unsigned int lo, hi;
+		__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
+		return ((uint64)hi << 32) | lo;
+#endif
+	}
+	Clock() : clock_(0), count_(0) { }
+	/** Open an interval: the start stamp is subtracted so that end() leaves the difference. */
+	void begin() { clock_ -= getRdtsc(); }
+	/** Close an interval: add the end stamp and count one more sample. */
+	void end()
+	{
+		clock_ += getRdtsc();
+		++count_;
+	}
+	/** Number of end() calls since construction or the last clear(). */
+	int getCount() const { return count_; }
+	/** Accumulated tick total; meaningful once every begin() has been matched by an end(). */
+	uint64 getClock() const { return clock_; }
+	/** Reset the tick total and the sample count to zero. */
+	void clear() { clock_ = 0; count_ = 0; }
+private:
+	uint64 clock_;
+	int count_;
+};
+
 #ifdef XBYAK32
 
 namespace local {
@@ -133,53 +196,47 @@ XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG(ebp)
 #undef XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG
 } // end of local
 
-template<class Gen>
-struct EnableSetEip : public Gen {
-	EnableSetEip(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0)
-		: Gen(maxSize, userPtr)
-	{
-	}
-	/**
-		get pid to out register
-		@note out = eax or ecx or edx
-	*/
-	void setEipTo(const Xbyak::Reg32& out)
-	{
+/**
+	get eip into the out register by emitting code through self (the generator object)
+	@note out must be one of eax/ecx/edx/ebx/esi/edi/ebp; any other register (e.g. esp) reaches assert(0)
+*/
+template<class T>
+void setEipTo(T *self, const Xbyak::Reg32& out)
+{
 #if 0
-		Gen::call(Gen::getCurr() + 5);
-		Gen::pop(out);
+	self->call("@f"); // disabled variant: call the label defined on the next line, then pop into out
+self->L("@@");
+	self->pop(out);
 #else
-		int idx = out.getIdx();
-		switch (idx) {
-		case Xbyak::Operand::EAX:
-			Gen::call((void*)local::set_eip_to_eax);
-			break;
-		case Xbyak::Operand::ECX:
-			Gen::call((void*)local::set_eip_to_ecx);
-			break;
-		case Xbyak::Operand::EDX:
-			Gen::call((void*)local::set_eip_to_edx);
-			break;
-		case Xbyak::Operand::EBX:
-			Gen::call((void*)local::set_eip_to_ebx);
-			break;
-		case Xbyak::Operand::ESI:
-			Gen::call((void*)local::set_eip_to_esi);
-			break;
-		case Xbyak::Operand::EDI:
-			Gen::call((void*)local::set_eip_to_edi);
-			break;
-		case Xbyak::Operand::EBP:
-			Gen::call((void*)local::set_eip_to_ebp);
-			break;
-		default:
-			assert(0);
-		}
-#endif
+	int idx = out.getIdx(); // active variant: emit a call to the local:: helper named after the requested register
+	switch (idx) {
+	case Xbyak::Operand::EAX:
+		self->call((void*)local::set_eip_to_eax);
+		break;
+	case Xbyak::Operand::ECX:
+		self->call((void*)local::set_eip_to_ecx);
+		break;
+	case Xbyak::Operand::EDX:
+		self->call((void*)local::set_eip_to_edx);
+		break;
+	case Xbyak::Operand::EBX:
+		self->call((void*)local::set_eip_to_ebx);
+		break;
+	case Xbyak::Operand::ESI:
+		self->call((void*)local::set_eip_to_esi);
+		break;
+	case Xbyak::Operand::EDI:
+		self->call((void*)local::set_eip_to_edi);
+		break;
+	case Xbyak::Operand::EBP:
+		self->call((void*)local::set_eip_to_ebp);
+		break;
+	default:
+		assert(0); // no helper is called for any other register index; nothing is emitted when assert is compiled out
 	}
-};
+#endif
+}
 #endif
 
 } } // end of util
 #endif
-