Finished the emitter, complete with code cleanups! :) (Added the last few SSE instructions, and inserted placeholders for some future additions to the x86 portion, regarding xchg/xadd/etc.)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1047 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-04-23 12:39:59 +00:00 · 2009-04-23 12:39:59 +00:00 · ef565303a5
parent ac0768e9a3
commit ef565303a5
18 changed files with 1072 additions and 974 deletions
--- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
+++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@ -905,14 +905,6 @@
 		<Filter
 			Name="Misc"
 			>
-			<File
-				RelativePath="..\..\HashMap.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\HashTools.cpp"
-				>
-			</File>
 			<File
 				RelativePath="..\..\Dump.cpp"
 				>
@ -921,6 +913,14 @@
 				RelativePath="..\..\Dump.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\HashMap.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\HashTools.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\Misc.cpp"
 				>
@ -2965,10 +2965,6 @@
 				RelativePath="..\..\x86\ix86\ix86_legacy_internal.h"
 				>
 			</File>
-			<File
-				RelativePath="..\..\x86\ix86\ix86_legacy_mmx.cpp"
-				>
-			</File>
 			<File
 				RelativePath="..\..\x86\ix86\ix86_legacy_sse.cpp"
 				>
@ -2977,6 +2973,10 @@
 				RelativePath="..\..\x86\ix86\ix86_legacy_types.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\x86\ix86\ix86_simd.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\x86\ix86\ix86_sse_helpers.h"
 				>
@ -3028,6 +3028,10 @@
 					RelativePath="..\..\x86\ix86\implement\test.h"
 					>
 				</File>
+				<File
+					RelativePath="..\..\x86\ix86\implement\xchg.h"
+					>
+				</File>
 				<Filter
 					Name="xmm"
 					>
--- a/pcsx2/x86/iR3000A.cpp
+++ b/pcsx2/x86/iR3000A.cpp
@ -935,6 +935,8 @@ void psxRecompileNextInstruction(int delayslot)
 #ifdef _DEBUG
 static void printfn()
 {
+	extern void iDumpPsxRegisters(u32 startpc, u32 temp);
+
 	static int lastrec = 0;
 	static int curcount = 0;
 	const int skip = 0;
@ -962,6 +964,8 @@ void iopRecRecompile(u32 startpc)
 	u32 willbranch3 = 0;

 #ifdef _DEBUG
+	extern void iDumpPsxRegisters(u32 startpc, u32 temp);
+
 	if( psxdump & 4 )
 		iDumpPsxRegisters(startpc, 0);
 #endif
--- a/pcsx2/x86/ix86-32/iR5900Move.cpp
+++ b/pcsx2/x86/ix86-32/iR5900Move.cpp
@ -316,7 +316,7 @@ void recMFHILO1(int hi)

 	if( reghi >= 0 ) {
 		if( regd >= 0 ) {
-			SSEX_MOVHLPS_XMM_to_XMM(regd, reghi);
+			SSE_MOVHLPS_XMM_to_XMM(regd, reghi);
 			xmmregs[regd].mode |= MODE_WRITE;
 		}
 		else {
--- a/pcsx2/x86/ix86/implement/bittest.h
+++ b/pcsx2/x86/ix86/implement/bittest.h
@ -32,152 +32,39 @@ enum G8Type
 	G8Type_BTC,
 };

-//////////////////////////////////////////////////////////////////////////////////////////
-// Notes: Bit Test instructions are valid on 16/32 bit operands only.
-//
-template< G8Type InstType, typename ImmType >
-class Group8Impl
-{
-protected:
-	static const uint OperandSize = sizeof(ImmType);
-
-	static void prefix16()		{ if( OperandSize == 2 ) xWrite<u8>( 0x66 ); }
-
-public: 
-	Group8Impl() {}		// For the love of GCC.
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const xRegister<ImmType>& bitbase, const xRegister<ImmType>& bitoffset )
-	{
-		prefix16();
-		xWrite<u8>( 0x0f );
-		xWrite<u8>( 0xa3 | (InstType << 2) );
-		ModRM_Direct( bitoffset.Id, bitbase.Id );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( void* bitbase, const xRegister<ImmType>& bitoffset )
-	{
-		prefix16();
-		xWrite<u8>( 0x0f );
-		xWrite<u8>( 0xa3 | (InstType << 2) );
-		xWriteDisp( bitoffset.Id, bitbase );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const ModSibBase& bitbase, const xRegister<ImmType>& bitoffset )
-	{
-		prefix16();
-		xWrite<u8>( 0x0f );
-		xWrite<u8>( 0xa3 | (InstType << 2) );
-		EmitSibMagic( bitoffset.Id, bitbase );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const xRegister<ImmType>& bitbase, u8 immoffset )
-	{
-		prefix16();
-		xWrite<u16>( 0xba0f );
-		ModRM_Direct( InstType, bitbase.Id );
-		xWrite<u8>( immoffset );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const ModSibStrict<ImmType>& bitbase, u8 immoffset )
-	{
-		prefix16();
-		xWrite<u16>( 0xba0f );
-		EmitSibMagic( InstType, bitbase );
-		xWrite<u8>( immoffset );
-	}
-};
-
-// -------------------------------------------------------------------
-//
-template< G8Type InstType >
-class Group8ImplAll
-{
-protected:
-	typedef Group8Impl<InstType,u32> m_32;
-	typedef Group8Impl<InstType,u32> m_16;
-
-public:
-	__forceinline void operator()( const xRegister32& bitbase,	const xRegister32& bitoffset ) const	{ m_32::Emit( bitbase, bitoffset ); }
-	__forceinline void operator()( const xRegister16& bitbase,	const xRegister16& bitoffset ) const	{ m_16::Emit( bitbase, bitoffset ); }
-	__forceinline void operator()( void* bitbase,				const xRegister32& bitoffset ) const	{ m_32::Emit( bitbase, bitoffset ); }
-	__forceinline void operator()( void* bitbase,				const xRegister16& bitoffset ) const	{ m_16::Emit( bitbase, bitoffset ); }
-	__noinline void operator()( const ModSibBase& bitbase,		const xRegister32& bitoffset ) const	{ m_32::Emit( bitbase, bitoffset ); }
-	__noinline void operator()( const ModSibBase& bitbase,		const xRegister16& bitoffset ) const	{ m_16::Emit( bitbase, bitoffset ); }
-
-	__noinline void operator()( const ModSibStrict<u32>& bitbase, u8 bitoffset ) const	{ m_32::Emit( bitbase, bitoffset ); }
-	__noinline void operator()( const ModSibStrict<u16>& bitbase, u8 bitoffset ) const	{ m_16::Emit( bitbase, bitoffset ); }
-	void operator()( const xRegister<u32>& bitbase, u8 bitoffset ) const				{ m_32::Emit( bitbase, bitoffset ); }
-	void operator()( const xRegister<u16>& bitbase, u8 bitoffset ) const				{ m_16::Emit( bitbase, bitoffset ); }
-
-	Group8ImplAll() {}
-};
-
-
 //////////////////////////////////////////////////////////////////////////////////////////
 // BSF / BSR -- 16/32 operands supported only.
 //
-template< bool isReverse, typename ImmType >
+// 0xbc [fwd] / 0xbd [rev]
+//
+template< u16 Opcode >
 class BitScanImpl
 {
-protected:
-	static const uint OperandSize = sizeof(ImmType);
-	static void prefix16()		{ if( OperandSize == 2 ) xWrite<u8>( 0x66 ); }
-	static void emitbase()
-	{
-		prefix16();
-		xWrite<u8>( 0x0f );
-		xWrite<u8>( isReverse ? 0xbd : 0xbc );
-	}
-
 public:
-	BitScanImpl() {}		// For the love of GCC.
+	BitScanImpl() {}

-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const xRegister<ImmType>& to, const xRegister<ImmType>& from )
-	{
-		emitbase();
-		ModRM_Direct( to.Id, from.Id );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const xRegister<ImmType>& to, const void* src )
-	{
-		emitbase();
-		xWriteDisp( to.Id, src );
-	}
-
-	// ------------------------------------------------------------------------
-	static __emitinline void Emit( const xRegister<ImmType>& to, const ModSibBase& sibsrc )
-	{
-		emitbase();
-		EmitSibMagic( to.Id, sibsrc );
-	}
+	__forceinline void operator()( const xRegister32& to, const xRegister32& from ) const	{ xOpWrite0F( Opcode, to, from ); }
+	__forceinline void operator()( const xRegister16& to, const xRegister16& from ) const	{ xOpWrite0F( 0x66, Opcode, to, from ); }
+	__forceinline void operator()( const xRegister32& to, const void* src ) const			{ xOpWrite0F( Opcode, to, src ); }
+	__forceinline void operator()( const xRegister16& to, const void* src ) const			{ xOpWrite0F( 0x66, Opcode, to, src ); }
+	__forceinline void operator()( const xRegister32& to, const ModSibBase& sibsrc ) const	{ xOpWrite0F( Opcode, to, sibsrc ); }
+	__forceinline void operator()( const xRegister16& to, const ModSibBase& sibsrc ) const	{ xOpWrite0F( 0x66, Opcode, to, sibsrc ); }
 };

-
-// -------------------------------------------------------------------
-// BSF/BSR  -- 16 and 32 bit operand forms only!
+//////////////////////////////////////////////////////////////////////////////////////////
+// Bit Test Instructions - Valid on 16/32 bit operands only.
 //
-template< bool isReverse >
-class BitScanImplAll
+template< G8Type InstType >
+class Group8Impl : public BitScanImpl<0xa3 | (InstType << 2)>
 {
-protected:
-	typedef BitScanImpl<isReverse,u32> m_32;
-	typedef BitScanImpl<isReverse,u32> m_16;
-
 public:
-	__forceinline void operator()( const xRegister32& to, const xRegister32& from ) const	{ m_32::Emit( to, from ); }
-	__forceinline void operator()( const xRegister16& to, const xRegister16& from ) const	{ m_16::Emit( to, from ); }
-	__forceinline void operator()( const xRegister32& to, const void* src ) const			{ m_32::Emit( to, src ); }
-	__forceinline void operator()( const xRegister16& to, const void* src ) const			{ m_16::Emit( to, src ); }
-	__noinline void operator()( const xRegister32& to, const ModSibBase& sibsrc ) const		{ m_32::Emit( to, sibsrc ); }
-	__noinline void operator()( const xRegister16& to, const ModSibBase& sibsrc ) const		{ m_16::Emit( to, sibsrc ); }
+	using BitScanImpl<0xa3 | (InstType << 2)>::operator();

-	BitScanImplAll() {}
+	__forceinline void operator()( const ModSibStrict<u32>& bitbase, u8 bitoffset ) const	{ xOpWrite0F( 0xba, InstType, bitbase );		xWrite<u8>( bitoffset ); }
+	__forceinline void operator()( const ModSibStrict<u16>& bitbase, u8 bitoffset ) const	{ xOpWrite0F( 0x66, 0xba, InstType, bitbase );	xWrite<u8>( bitoffset ); }
+	void operator()( const xRegister<u32>& bitbase, u8 bitoffset ) const					{ xOpWrite0F( 0xba, InstType, bitbase );		xWrite<u8>( bitoffset ); }
+	void operator()( const xRegister<u16>& bitbase, u8 bitoffset ) const					{ xOpWrite0F( 0x66, 0xba, InstType, bitbase );	xWrite<u8>( bitoffset ); }
+
+	Group8Impl() {}
 };

--- a/pcsx2/x86/ix86/implement/group1.h
+++ b/pcsx2/x86/ix86/implement/group1.h
@ -167,9 +167,9 @@ class xImpl_G1Compare : xImpl_Group1< G1Type_CMP >
 protected:
 	template< u8 Prefix > struct Woot
 	{
-		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, SSE2_ComparisonType cmptype ) const{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
-		__forceinline void operator()( const xRegisterSSE& to, const void* from, SSE2_ComparisonType cmptype ) const		{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
-		__noinline void operator()( const xRegisterSSE& to, const ModSibBase& from, SSE2_ComparisonType cmptype ) const		{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, SSE2_ComparisonType cmptype ) const{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const void* from, SSE2_ComparisonType cmptype ) const		{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, SSE2_ComparisonType cmptype ) const	{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( cmptype ); }
 		Woot() {}
 	};

--- a/pcsx2/x86/ix86/implement/xchg.h
+++ b/pcsx2/x86/ix86/implement/xchg.h
@ -0,0 +1,22 @@
+/*  Pcsx2 - Pc Ps2 Emulator
+ *  Copyright (C) 2002-2009  Pcsx2 Team
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *  
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#pragma once
+
+// This header file is intended to be the future home of xchg, cmpxchg, xadd, and
+// other threading-related exchange instructions.
--- a/pcsx2/x86/ix86/implement/xmm/arithmetic.h
+++ b/pcsx2/x86/ix86/implement/xmm/arithmetic.h
@ -28,19 +28,25 @@ class _SimdShiftHelper
 public:
 	_SimdShiftHelper() {}

-	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const { writeXMMop( 0x66, Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ writeXMMop( 0x66, Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ writeXMMop( 0x66, Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const { xOpWrite0F( 0x66, Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ xOpWrite0F( 0x66, Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( 0x66, Opcode1, to, from ); }

-	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const { writeXMMop( Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterMMX& to, const void* from ) const			{ writeXMMop( Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from ) const	{ writeXMMop( Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const { xOpWrite0F( Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterMMX& to, const void* from ) const			{ xOpWrite0F( Opcode1, to, from ); }
+	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from ) const	{ xOpWrite0F( Opcode1, to, from ); }


-	template< typename OperandType >
-	__emitinline void operator()( const xRegisterSIMD<OperandType>& to, u8 imm8 ) const
+	__emitinline void operator()( const xRegisterSSE& to, u8 imm8 ) const
 	{
-		SimdPrefix( (sizeof( OperandType ) == 16) ? 0x66 : 0, OpcodeImm );
+		SimdPrefix( 0x66, OpcodeImm );
+		ModRM( 3, (int)Modcode, to.Id );
+		xWrite<u8>( imm8 );
+	}
+
+	__emitinline void operator()( const xRegisterMMX& to, u8 imm8 ) const
+	{
+		SimdPrefix( 0x00, OpcodeImm );
 		ModRM( 3, (int)Modcode, to.Id );
 		xWrite<u8>( imm8 );
 	}
@ -68,11 +74,11 @@ class SimdImpl_Shift : public SimdImpl_ShiftWithoutQ<OpcodeBase1, Modcode>
 public:
 	const _SimdShiftHelper<OpcodeBase1+3,0x73,Modcode> Q;
 	
-	void DQ( const xRegisterSSE& to, u8 imm ) const
+	void DQ( const xRegisterSSE& to, u8 imm8 ) const
 	{
 		SimdPrefix( 0x66, 0x73 );
 		ModRM( 3, (int)Modcode+1, to.Id );
-		xWrite<u8>( imm );
+		xWrite<u8>( imm8 );
 	}
 	
 	SimdImpl_Shift() {}
@ -156,8 +162,8 @@ template< u16 OpcodeSSE >
 class SimdImpl_Sqrt : public SimdImpl_rSqrt<OpcodeSSE>
 {
 public:
-	const SimdImpl_DestRegSSE<0xf2,OpcodeSSE> SD;
 	SimdImpl_Sqrt() {}
+	const SimdImpl_DestRegSSE<0xf2,OpcodeSSE> SD;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
@ -165,9 +171,9 @@ public:
 class SimdImpl_AndNot
 {
 public:
+	SimdImpl_AndNot() {}
 	const SimdImpl_DestRegSSE<0x00,0x55> PS;
 	const SimdImpl_DestRegSSE<0x66,0x55> PD;
-	SimdImpl_AndNot() {}
 };

 //////////////////////////////////////////////////////////////////////////////////////////
@ -282,3 +288,87 @@ public:
 	//   *src* stores the result in the high quadword of dest.
 	const SimdImpl_DestRegSSE<0x66, 0x7c> PD;
 };
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// DotProduct calculation (SSE4.1 only!)
+//
+class SimdImpl_DotProduct
+{
+public:
+	SimdImpl_DotProduct() {}
+
+	// [SSE-4.1] Conditionally multiplies the packed single precision floating-point
+	// values in dest with the packed single-precision floats in src depending on a
+	// mask extracted from the high 4 bits of the immediate byte. If a condition mask
+	// bit in Imm8[7:4] is zero, the corresponding multiplication is replaced by a value
+	// of 0.0.  The four resulting single-precision values are summed into an
+	// intermediate result.
+	//
+	// The intermediate result is conditionally broadcasted to the destination using a
+	// broadcast mask specified by bits [3:0] of the immediate byte. If a broadcast
+	// mask bit is 1, the intermediate result is copied to the corresponding dword
+	// element in dest.  If a broadcast mask bit is zero, the corresponding element in
+	// the destination is set to zero.
+	//
+	SimdImpl_DestRegImmSSE<0x66,0x403a> PS;
+
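+	// [SSE-4.1] Double-precision form of the dot product: operates on the two packed doubles, using Imm8[5:4] as the condition mask and Imm8[1:0] as the broadcast mask.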
+	SimdImpl_DestRegImmSSE<0x66,0x413a> PD;
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Rounds floating point values (packed or single scalar) by an arbitrary rounding mode.
+// (SSE4.1 only!)
+class SimdImpl_Round
+{
+public:
+	SimdImpl_Round() {}
+
+	// [SSE-4.1] Rounds the 4 packed single-precision src values and stores them in dest.
+	//
+	// Imm8 specifies control fields for the rounding operation:
+	//   Bit  3 - processor behavior for a precision exception (0: normal, 1: inexact)
+	//   Bit  2 - If enabled, use MXCSR.RC, else use RC specified in bits 1:0 of this Imm8.
+	//   Bits 1:0 - Specifies a rounding mode for this instruction only.
+	//
+	// Rounding Mode Reference:
+	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
+	//
+	const SimdImpl_DestRegImmSSE<0x66,0x083a> PS;
+
+	// [SSE-4.1] Rounds the 2 packed double-precision src values and stores them in dest.
+	//
+	// Imm8 specifies control fields for the rounding operation:
+	//   Bit  3 - processor behavior for a precision exception (0: normal, 1: inexact)
+	//   Bit  2 - If enabled, use MXCSR.RC, else use RC specified in bits 1:0 of this Imm8.
+	//   Bits 1:0 - Specifies a rounding mode for this instruction only.
+	//
+	// Rounding Mode Reference:
+	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
+	//
+	const SimdImpl_DestRegImmSSE<0x66,0x093a> PD;
+
+	// [SSE-4.1] Rounds the single-precision src value and stores in dest.
+	//
+	// Imm8 specifies control fields for the rounding operation:
+	//   Bit  3 - processor behavior for a precision exception (0: normal, 1: inexact)
+	//   Bit  2 - If enabled, use MXCSR.RC, else use RC specified in bits 1:0 of this Imm8.
+	//   Bits 1:0 - Specifies a rounding mode for this instruction only.
+	//
+	// Rounding Mode Reference:
+	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
+	//
+	const SimdImpl_DestRegImmSSE<0x66,0x0a3a> SS;
+
+	// [SSE-4.1] Rounds the double-precision src value and stores in dest.
+	//
+	// Imm8 specifies control fields for the rounding operation:
+	//   Bit  3 - processor behavior for a precision exception (0: normal, 1: inexact)
+	//   Bit  2 - If enabled, use MXCSR.RC, else use RC specified in bits 1:0 of this Imm8.
+	//   Bits 1:0 - Specifies a rounding mode for this instruction only.
+	//
+	// Rounding Mode Reference:
+	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
+	//
+	const SimdImpl_DestRegImmSSE<0x66,0x0b3a> SD;
+};
--- a/pcsx2/x86/ix86/implement/xmm/basehelpers.h
+++ b/pcsx2/x86/ix86/implement/xmm/basehelpers.h
@ -23,57 +23,106 @@

 extern void SimdPrefix( u8 prefix, u16 opcode );

-// ------------------------------------------------------------------------
-// xmm emitter helpers for xmm instruction with prefixes.
-// These functions also support deducing the use of the prefix from the template parameters,
-// since most xmm instructions use a prefix and most mmx instructions do not.  (some mov
-// instructions violate this "guideline.")
-//
-template< typename T, typename T2 >
-__emitinline void writeXMMop( u8 prefix, u16 opcode, const xRegister<T>& to, const xRegister<T2>& from, bool forcePrefix=false )
+extern void xOpWrite0F( u8 prefix, u16 opcode, int instId, const ModSibBase& sib );
+extern void xOpWrite0F( u8 prefix, u16 opcode, int instId, const void* data );
+extern void xOpWrite0F( u16 opcode, int instId, const ModSibBase& sib );
+extern void xOpWrite0F( u16 opcode, int instId, const void* data );
+
+template< typename T2 > __emitinline
+void xOpWrite0F( u8 prefix, u16 opcode, int instId, const xRegister<T2>& from )
 {
-	SimdPrefix( (forcePrefix || (sizeof( T ) == 16)) ? prefix : 0, opcode );
-	ModRM_Direct( to.Id, from.Id );
+	SimdPrefix( prefix, opcode );
+	ModRM_Direct( instId, from.Id );
 }

-template< typename T >
-__noinline void writeXMMop( u8 prefix, u16 opcode, const xRegister<T>& reg, const ModSibBase& sib, bool forcePrefix=false )
+template< typename T2 > __emitinline
+void xOpWrite0F( u16 opcode, int instId, const xRegister<T2>& from )
 {
-	SimdPrefix( (forcePrefix || (sizeof( T ) == 16)) ? prefix : 0, opcode );
-	EmitSibMagic( reg.Id, sib );
+	xOpWrite0F( 0, opcode, instId, from );
 }

-template< typename T >
-__emitinline void writeXMMop( u8 prefix, u16 opcode, const xRegister<T>& reg, const void* data, bool forcePrefix=false )
+template< typename T, typename T2 > __emitinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& to, const xRegister<T2>& from, bool forcePrefix=false )
 {
-	SimdPrefix( (forcePrefix || (sizeof( T ) == 16)) ? prefix : 0, opcode );
-	xWriteDisp( reg.Id, data );
+	xOpWrite0F( prefix, opcode, to.Id, from );
+}
+
+template< typename T > __noinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& reg, const ModSibBase& sib, bool forcePrefix=false )
+{
+	xOpWrite0F( prefix, opcode, reg.Id, sib );
+}
+
+template< typename T > __emitinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& reg, const void* data, bool forcePrefix=false )
+{
+	xOpWrite0F( prefix, opcode, reg.Id, data );
 }

 // ------------------------------------------------------------------------
-// xmm emitter helpers for xmm instructions *without* prefixes.
-// These are normally used for special instructions that have MMX forms only (non-SSE), however
-// some special forms of sse/xmm mov instructions also use them due to prefixing inconsistencies.
 //
-template< typename T, typename T2 >
-__emitinline void writeXMMop( u16 opcode, const xRegister<T>& to, const xRegister<T2>& from )
+template< typename T, typename T2 > __emitinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& to, const xRegister<T2>& from, u8 imm8 )
 {
-	SimdPrefix( 0, opcode );
-	ModRM_Direct( to.Id, from.Id );
+	xOpWrite0F( prefix, opcode, to, from );
+	xWrite<u8>( imm8 );
 }

-template< typename T >
-__noinline void writeXMMop( u16 opcode, const xRegister<T>& reg, const ModSibBase& sib )
+template< typename T > __noinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& reg, const ModSibBase& sib, u8 imm8 )
 {
-	SimdPrefix( 0, opcode );
-	EmitSibMagic( reg.Id, sib );
+	xOpWrite0F( prefix, opcode, reg, sib );
+	xWrite<u8>( imm8 );
 }

-template< typename T >
-__emitinline void writeXMMop( u16 opcode, const xRegister<T>& reg, const void* data )
+template< typename T > __emitinline
+void xOpWrite0F( u8 prefix, u16 opcode, const xRegister<T>& reg, const void* data, u8 imm8 )
 {
-	SimdPrefix( 0, opcode );
-	xWriteDisp( reg.Id, data );
+	xOpWrite0F( prefix, opcode, reg, data );
+	xWrite<u8>( imm8 );
+}
+
+// ------------------------------------------------------------------------
+
+template< typename T, typename T2 > __emitinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& to, const xRegister<T2>& from )
+{
+	xOpWrite0F( 0, opcode, to, from );
+}
+
+template< typename T > __noinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& reg, const ModSibBase& sib )
+{
+	xOpWrite0F( 0, opcode, reg, sib );
+}
+
+template< typename T > __emitinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& reg, const void* data )
+{
+	xOpWrite0F( 0, opcode, reg, data );
+}
+
+// ------------------------------------------------------------------------
+
+template< typename T, typename T2 > __emitinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& to, const xRegister<T2>& from, u8 imm8 )
+{
+	xOpWrite0F( opcode, to, from );
+	xWrite<u8>( imm8 );
+}
+
+template< typename T > __noinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& reg, const ModSibBase& sib, u8 imm8 )
+{
+	xOpWrite0F( opcode, reg, sib );
+	xWrite<u8>( imm8 );
+}
+
+template< typename T > __emitinline
+void xOpWrite0F( u16 opcode, const xRegister<T>& reg, const void* data, u8 imm8 )
+{
+	xOpWrite0F( opcode, reg, data );
+	xWrite<u8>( imm8 );
 }

 // ------------------------------------------------------------------------
@ -84,9 +133,9 @@ template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegSSE
 {
 public:
-	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ writeXMMop( Prefix, Opcode, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ writeXMMop( Prefix, Opcode, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ writeXMMop( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }

 	SimdImpl_DestRegSSE() {} //GCWho?
 };
@ -99,9 +148,9 @@ template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegImmSSE
 {
 public:
-	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const	{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
-	__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm ) const			{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
-	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm ) const	{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
+	__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm ) const			{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }

 	SimdImpl_DestRegImmSSE() {} //GCWho?
 };
@ -110,9 +159,9 @@ template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegImmMMX
 {
 public:
-	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm ) const	{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
-	__forceinline void operator()( const xRegisterMMX& to, const void* from, u8 imm ) const			{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
-	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from, u8 imm ) const	{ writeXMMop( Prefix, Opcode, to, from ); xWrite<u8>( imm ); }
+	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }
+	__forceinline void operator()( const xRegisterMMX& to, const void* from, u8 imm ) const			{ xOpWrite0F( Opcode, to, from, imm ); }
+	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }

 	SimdImpl_DestRegImmMMX() {} //GCWho?
 };
@ -125,27 +174,33 @@ template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegEither
 {
 public:
-	template< typename T > __forceinline
-	void operator()( const xRegisterSIMD<T>& to, const xRegisterSIMD<T>& from ) const	{ writeXMMop( Prefix, Opcode, to, from ); }
-	template< typename T > __forceinline
-	void operator()( const xRegisterSIMD<T>& to, const void* from ) const				{ writeXMMop( Prefix, Opcode, to, from ); }
-	template< typename T > __forceinline
-	void operator()( const xRegisterSIMD<T>& to, const ModSibBase& from ) const			{ writeXMMop( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
+
+	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const	{ xOpWrite0F( Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterMMX& to, const void* from ) const			{ xOpWrite0F( Opcode, to, from ); }
+	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from ) const	{ xOpWrite0F( Opcode, to, from ); }

 	SimdImpl_DestRegEither() {} //GCWho?
 };

 // ------------------------------------------------------------------------
-// For implementing MMX/SSE operations which the destination *must* be a register, but the source
-// can be regDirect or ModRM (indirect).
+// For implementing MMX/SSE operations where the destination *must* be a register, but the
+// source can be Direct or Indirect (ModRM/SibSB).  The SrcOperandType template parameter
+// is used to enforce type strictness of the (void*) parameter and ModSib<> parameter, so
+// that the programmer must be explicit in specifying desired operand size.
+//
+// IMPORTANT: This helper assumes the prefix opcode is written *always* -- regardless of
+// MMX or XMM register status.
 //
 template< u8 Prefix, u16 Opcode, typename DestRegType, typename SrcRegType, typename SrcOperandType >
 class SimdImpl_DestRegStrict
 {
 public:
-	__forceinline void operator()( const DestRegType& to, const SrcRegType& from ) const					{ writeXMMop( Prefix, Opcode, to, from, true ); }
-	__forceinline void operator()( const DestRegType& to, const SrcOperandType* from ) const				{ writeXMMop( Prefix, Opcode, to, from, true ); }
-	__forceinline void operator()( const DestRegType& to, const ModSibStrict<SrcOperandType>& from ) const	{ writeXMMop( Prefix, Opcode, to, from, true ); }
+	__forceinline void operator()( const DestRegType& to, const SrcRegType& from ) const					{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const DestRegType& to, const SrcOperandType* from ) const				{ xOpWrite0F( Prefix, Opcode, to, from ); }
+	__forceinline void operator()( const DestRegType& to, const ModSibStrict<SrcOperandType>& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }

 	SimdImpl_DestRegStrict() {} //GCWho?
 };
--- a/pcsx2/x86/ix86/implement/xmm/comparisons.h
+++ b/pcsx2/x86/ix86/implement/xmm/comparisons.h
@ -41,9 +41,9 @@ class SimdImpl_Compare
 protected:
 	template< u8 Prefix > struct Woot
 	{
-		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
-		__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
-		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ writeXMMop( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
+		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
+		__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
+		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, 0xc2, to, from ); xWrite<u8>( CType ); }
 		Woot() {}
 	};

@ -128,4 +128,3 @@ public:
 	// packed min/max values in dest. (SSE operands only)
 	const SimdImpl_DestRegSSE<0x66,((Opcode2+3)<<8)|0x38> UD;
 };
-
--- a/pcsx2/x86/ix86/implement/xmm/moremovs.h
+++ b/pcsx2/x86/ix86/implement/xmm/moremovs.h
@ -30,10 +30,10 @@ protected:
 	struct Woot
 	{
 		Woot() {}
-		__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ writeXMMop( Prefix, Opcode, to, from ); }
-		__forceinline void operator()( const void* to, const xRegisterSSE& from ) const			{ writeXMMop( Prefix, Opcode+1, from, to ); }
-		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ writeXMMop( Prefix, Opcode, to, from ); }
-		__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const	{ writeXMMop( Prefix, Opcode+1, from, to ); }
+		__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ xOpWrite0F( Prefix, Opcode, to, from ); }
+		__forceinline void operator()( const void* to, const xRegisterSSE& from ) const			{ xOpWrite0F( Prefix, Opcode+1, from, to ); }
+		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
+		__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, Opcode+1, from, to ); }
 	};

 public:
@ -51,26 +51,104 @@ template< u16 Opcode >
 class MovhlImpl_RtoR
 {
 public:
-	__forceinline void PS( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ writeXMMop( Opcode, to, from ); }
-	__forceinline void PD( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ writeXMMop( 0x66, Opcode, to, from ); }
+	__forceinline void PS( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ xOpWrite0F( Opcode, to, from ); }
+	__forceinline void PD( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ xOpWrite0F( 0x66, Opcode, to, from ); }

 	MovhlImpl_RtoR() {} //GCC.
 };

-// ------------------------------------------------------------------------
-template< u8 Prefix, u16 Opcode, u16 OpcodeAlt >
-class MovapsImplAll
+//////////////////////////////////////////////////////////////////////////////////////////
+// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
+//
+// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
+// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
+// which can be checked for alignment at runtime.
+// 
+template< u8 Prefix, bool isAligned >
+class SimdImpl_MoveSSE
 {
-public:
-	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ if( to != from ) writeXMMop( Prefix, Opcode, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const			{ writeXMMop( Prefix, Opcode, to, from ); }
-	__forceinline void operator()( const void* to, const xRegisterSSE& from ) const			{ writeXMMop( Prefix, OpcodeAlt, from, to ); }
-	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ writeXMMop( Prefix, Opcode, to, from ); }
-	__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const		{ writeXMMop( Prefix, OpcodeAlt, from, to ); }
+	static const u16 OpcodeA = 0x28;		// Aligned [aps] form
+	static const u16 OpcodeU = 0x10;		// unaligned [ups] form

-	MovapsImplAll() {} //GCC.
+public:
+	SimdImpl_MoveSSE() {} //GCC.
+
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
+	{
+		if( to != from ) xOpWrite0F( Prefix, OpcodeA, to, from );
+	}
+
+	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const	
+	{
+		xOpWrite0F( Prefix, (isAligned || ((uptr)from & 0x0f) == 0) ? OpcodeA : OpcodeU, to, from );
+	}
+
+	__forceinline void operator()( void* to, const xRegisterSSE& from ) const
+	{
+		xOpWrite0F( Prefix, (isAligned || ((uptr)to & 0x0f) == 0) ? OpcodeA+1 : OpcodeU+1, from, to );
+	}
+
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const
+	{
+		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
+		bool isReallyAligned = isAligned || ( ((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty() );
+		xOpWrite0F( Prefix, isReallyAligned ? OpcodeA : OpcodeU, to, from );
+	}
+
+	__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const
+	{
+		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
+		bool isReallyAligned = isAligned || ( (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty() );
+		xOpWrite0F( Prefix, isReallyAligned ? OpcodeA+1 : OpcodeU+1, from, to );
+	}
 };

+//////////////////////////////////////////////////////////////////////////////////////////
+// Implementations for MOVDQA / MOVDQU.  The unaligned (MOVDQU) variant still emits MOVDQA
+// when the memory operand is a plain address/displacement that is 16-byte aligned.
+template< u8 Prefix, bool isAligned >
+class SimdImpl_MoveDQ
+{
+	static const u8 PrefixA = 0x66;		// Aligned [movdqa] form
+	static const u8 PrefixU = 0xf3;		// unaligned [movdqu] form
+
+	static const u16 Opcode = 0x6f;		// load form (xmm <- xmm/mem)
+	static const u16 Opcode_Alt = 0x7f; // store form (mem <- xmm); operands are passed as (reg, mem)
+
+public:
+	SimdImpl_MoveDQ() {} //GCC.
+
+	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
+	{
+		if( to != from ) xOpWrite0F( PrefixA, Opcode, to, from );
+	}
+
+	__forceinline void operator()( const xRegisterSSE& to, const void* from ) const
+	{
+		xOpWrite0F( (isAligned || ((uptr)from & 0x0f) == 0) ? PrefixA : PrefixU, Opcode, to, from );
+	}
+
+	__forceinline void operator()( const void* to, const xRegisterSSE& from ) const
+	{
+		xOpWrite0F( (isAligned || ((uptr)to & 0x0f) == 0) ? PrefixA : PrefixU, Opcode_Alt, from, to );
+	}
+
+	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const
+	{
+		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
+		bool isReallyAligned = isAligned || ( (from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty() );
+		xOpWrite0F( isReallyAligned ? PrefixA : PrefixU, Opcode, to, from );
+	}
+
+	__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const
+	{
+		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
+		bool isReallyAligned = isAligned || ( (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty() );
+		xOpWrite0F( isReallyAligned ? PrefixA : PrefixU, Opcode_Alt, from, to );
+	}
+};
+
+
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 template< u8 AltPrefix, u16 OpcodeSSE >
@ -83,12 +161,79 @@ public:
 };

 //////////////////////////////////////////////////////////////////////////////////////////
+// Blend - Conditional copying of values in src into dest.
 //
 class SimdImpl_Blend
 {
+public:
+	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
+	// mask bits in the immediate operand (bits [3:0]).  Each mask bit corresponds to a
+	// dword element in a 128-bit operand. 
+	//
+	// If a mask bit is 1, then the corresponding dword in the source operand is copied
+	// to dest, else the dword element in dest is left unchanged.
+	//
 	SimdImpl_DestRegImmSSE<0x66,0x0c3a> PS;
+
+	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
+	// mask bits in the immediate operand (bits [1:0]).  Each mask bit corresponds to a
+	// quadword element in a 128-bit operand. 
+	//
+	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
+	// to dest, else the quadword element in dest is left unchanged.
+	//
 	SimdImpl_DestRegImmSSE<0x66,0x0d3a> PD;
 	
-	SimdImpl_DestRegImmSSE<0x66,0x1438> VPS;
-	SimdImpl_DestRegImmSSE<0x66,0x1538> VPD;
+	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
+	// mask in XMM0 (yes, the fixed register).  Each mask bit is the most significant
+	// bit of the corresponding dword element in XMM0.
+	//
+	// If a mask bit is 1, then the corresponding dword in the source operand is copied
+	// to dest, else the dword element in dest is left unchanged.
+	//
+	SimdImpl_DestRegSSE<0x66,0x1438> VPS;
+	
+	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
+	// mask in XMM0 (yes, the fixed register).  Each mask bit is the most significant
+	// bit of the corresponding quadword element in XMM0.
+	//
+	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
+	// to dest, else the quadword element in dest is left unchanged.
+	//
+	SimdImpl_DestRegSSE<0x66,0x1538> VPD;
 };
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Packed Move with Sign or Zero extension.
+// (SignExtend selects the 0x20-based opcodes; otherwise the 0x30-based opcodes are used.)
+template< bool SignExtend >
+class SimdImpl_PMove
+{
+	static const u16 OpcodeBase = SignExtend ? 0x2038 : 0x3038;	// low byte 0x38 = opcode prefix; high byte = BW opcode, each later form adds 0x100
+
+public:
+	// [SSE-4.1] Zero/Sign-extend the low byte values in src into word integers
+	// and store them in dest.  (strict memory operand type: u64)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase,xRegisterSSE,xRegisterSSE,u64> BW;
+
+	// [SSE-4.1] Zero/Sign-extend the low byte values in src into dword integers
+	// and store them in dest.  (strict memory operand type: u32)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x100,xRegisterSSE,xRegisterSSE,u32> BD;
+
+	// [SSE-4.1] Zero/Sign-extend the low byte values in src into qword integers
+	// and store them in dest.  (strict memory operand type: u16)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x200,xRegisterSSE,xRegisterSSE,u16> BQ;
+	
+	// [SSE-4.1] Zero/Sign-extend the low word values in src into dword integers
+	// and store them in dest.  (strict memory operand type: u64)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x300,xRegisterSSE,xRegisterSSE,u64> WD;
+
+	// [SSE-4.1] Zero/Sign-extend the low word values in src into qword integers
+	// and store them in dest.  (strict memory operand type: u32)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x400,xRegisterSSE,xRegisterSSE,u32> WQ;
+
+	// [SSE-4.1] Zero/Sign-extend the low dword values in src into qword integers
+	// and store them in dest.  (strict memory operand type: u64)
+	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x500,xRegisterSSE,xRegisterSSE,u64> DQ;
+};
+
--- a/pcsx2/x86/ix86/implement/xmm/shufflepack.h
+++ b/pcsx2/x86/ix86/implement/xmm/shufflepack.h
@ -26,9 +26,9 @@ class SimdImpl_Shuffle
 protected:
 	template< u8 Prefix > struct Woot
 	{
-		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 cmptype ) const	{ writeXMMop( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
-		__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 cmptype ) const			{ writeXMMop( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
-		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 cmptype ) const		{ writeXMMop( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 cmptype ) const	{ xOpWrite0F( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 cmptype ) const			{ xOpWrite0F( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
+		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 cmptype ) const	{ xOpWrite0F( Prefix, OpcodeSSE, to, from ); xWrite<u8>( cmptype ); }
 		Woot() {}
 	};

@ -182,20 +182,17 @@ protected:
 		
 		__forceinline void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}

 		__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}

 		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}
 	};
 	
@ -203,28 +200,13 @@ public:
 	SimdImpl_PInsert() {}

 	// Operation can be performed on either MMX or SSE src operands.
-	template< typename T >
-	__forceinline void W( const xRegisterSIMD<T>& to, const xRegister32& from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0xc4, to, from );
-		xWrite<u8>( imm8 );
-	}
+	__forceinline void W( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const	{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
+	__forceinline void W( const xRegisterSSE& to, const void* from, u8 imm8 ) const			{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
+	__forceinline void W( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const	{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }

-	// Operation can be performed on either MMX or SSE src operands.
-	template< typename T >
-	__forceinline void W( const xRegisterSIMD<T>& to, const void* from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0xc4, to, from );
-		xWrite<u8>( imm8 );
-	}
-
-	// Operation can be performed on either MMX or SSE src operands.
-	template< typename T >
-	__forceinline void W( const xRegisterSIMD<T>& to, const ModSibBase& from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0xc4, to, from );
-		xWrite<u8>( imm8 );
-	}
+	__forceinline void W( const xRegisterMMX& to, const xRegister32& from, u8 imm8 ) const	{ xOpWrite0F( 0xc4, to, from, imm8 ); }
+	__forceinline void W( const xRegisterMMX& to, const void* from, u8 imm8 ) const			{ xOpWrite0F( 0xc4, to, from, imm8 ); }
+	__forceinline void W( const xRegisterMMX& to, const ModSibBase& from, u8 imm8 ) const	{ xOpWrite0F( 0xc4, to, from, imm8 ); }

 	// [SSE-4.1] 
 	const ByteDwordForms<0x20> B;
@ -250,20 +232,17 @@ protected:

 		__forceinline void operator()( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}

 		__forceinline void operator()( void* dest, const xRegisterSSE& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, from, dest, imm8 );
 		}

 		__forceinline void operator()( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
-			xWrite<u8>( imm8 );
+			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, from, dest, imm8 );
 		}
 	};

@ -276,24 +255,11 @@ public:
 	//
 	// [SSE-4.1] Note: Indirect memory forms of this instruction are an SSE-4.1 extension!
 	//
-	template< typename T >
-	__forceinline void W( const xRegister32& to, const xRegisterSIMD<T>& from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0xc5, to, from, true );
-		xWrite<u8>( imm8 );
-	}
+	__forceinline void W( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0xc5, to, from, imm8 ); }
+	__forceinline void W( const xRegister32& to, const xRegisterMMX& from, u8 imm8 ) const		{ xOpWrite0F( 0xc5, to, from, imm8 ); }

-	__forceinline void W( void* dest, const xRegisterSSE& from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0x153a, from, dest );
-		xWrite<u8>( imm8 );
-	}
-
-	__forceinline void W( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
-	{
-		writeXMMop( 0x66, 0x153a, from, dest );
-		xWrite<u8>( imm8 );
-	}
+	__forceinline void W( void* dest, const xRegisterSSE& from, u8 imm8 ) const					{ xOpWrite0F( 0x66, 0x153a, from, dest, imm8 ); }
+	__forceinline void W( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0x153a, from, dest, imm8 ); }

 	// [SSE-4.1] Copies the byte element specified by imm8 from src to dest.  The upper bits
 	// of dest are zero-extended (cleared).  This can be used to extract any single packed
--- a/pcsx2/x86/ix86/ix86.cpp
+++ b/pcsx2/x86/ix86/ix86.cpp
@ -161,7 +161,40 @@ namespace Internal
 		xWriteDisp( regfield, (s32)address );
 	}
 	
-	// ------------------------------------------------------------------------
+	//////////////////////////////////////////////////////////////////////////////////////////
+	// emitter helpers for xmm instruction with prefixes, most of which are using
+	// the basic opcode format (items inside braces denote optional or conditional
+	// emission):
+	//
+	//   [Prefix] / 0x0f / [OpcodePrefix] / Opcode / ModRM+[SibSB]
+	//
+	// Prefixes are typically 0x66, 0xf2, or 0xf3.  OpcodePrefixes are either 0x38 or
+	// 0x3a [any other value will result in assertion failure].
+	//
+	__emitinline void xOpWrite0F( u8 prefix, u16 opcode, int instId, const ModSibBase& sib )
+	{
+		SimdPrefix( prefix, opcode );		// prefix / 0x0f / opcode bytes first...
+		EmitSibMagic( instId, sib );		// ...then the ModRM/SIB encoding of the memory operand
+	}
+
+	__emitinline void xOpWrite0F( u8 prefix, u16 opcode, int instId, const void* data )
+	{
+		SimdPrefix( prefix, opcode );		// prefix / 0x0f / opcode bytes first...
+		xWriteDisp( instId, data );			// ...then the displacement-only (direct address) operand
+	}
+
+	__emitinline void xOpWrite0F( u16 opcode, int instId, const ModSibBase& sib )
+	{
+		xOpWrite0F( 0, opcode, instId, sib );	// prefix-less form: forwards with prefix = 0
+	}
+
+	__emitinline void xOpWrite0F( u16 opcode, int instId, const void* data )
+	{
+		xOpWrite0F( 0, opcode, instId, data );	// prefix-less form: forwards with prefix = 0
+	}
+
+
+	//////////////////////////////////////////////////////////////////////////////////////////
 	// returns TRUE if this instruction requires SIB to be encoded, or FALSE if the
 	// instruction ca be encoded as ModRm alone.
 	static __forceinline bool NeedsSibMagic( const ModSibBase& info )
@ -288,13 +321,13 @@ const MovExtendImplAll<true>  xMOVSX;
 const DwordShiftImplAll<false> xSHLD;
 const DwordShiftImplAll<true>  xSHRD;

-const Group8ImplAll<G8Type_BT> xBT;
-const Group8ImplAll<G8Type_BTR> xBTR;
-const Group8ImplAll<G8Type_BTS> xBTS;
-const Group8ImplAll<G8Type_BTC> xBTC;
+const Group8Impl<G8Type_BT> xBT;
+const Group8Impl<G8Type_BTR> xBTR;
+const Group8Impl<G8Type_BTS> xBTS;
+const Group8Impl<G8Type_BTC> xBTC;

-const BitScanImplAll<false> xBSF;
-const BitScanImplAll<true> xBSR;
+const BitScanImpl<0xbc> xBSF;
+const BitScanImpl<0xbd> xBSR;

 // ------------------------------------------------------------------------
 const CMovImplGeneric xCMOV;
@ -635,320 +668,4 @@ __emitinline void xBSWAP( const xRegister32& to )
 	write8( 0xC8 | to.Id );
 }

-
-//////////////////////////////////////////////////////////////////////////////////////////
-// MMX / XMM Instructions
-// (these will get put in their own file later)
-
-// ------------------------------------------------------------------------
-// SimdPrefix - If the lower byte of the opcode is 0x38 or 0x3a, then the opcode is
-// treated as a 16 bit value (in SSE 0x38 and 0x3a denote prefixes for extended SSE3/4
-// instructions).  Any other lower value assumes the upper value is 0 and ignored.
-// Non-zero upper bytes, when the lower byte is not the 0x38 or 0x3a prefix, will
-// generate an assertion.
-//
-__emitinline void Internal::SimdPrefix( u8 prefix, u16 opcode )
-{
-	const bool is16BitOpcode = ((opcode & 0xff) == 0x38) || ((opcode & 0xff) == 0x3a);
-
-	// If the lower byte is not a valid previx and the upper byte is non-zero it
-	// means we made a mistake!
-	if( !is16BitOpcode ) jASSUME( (opcode >> 8) == 0 );
-
-	if( prefix != 0 )
-	{
-		if( is16BitOpcode )
-			xWrite<u32>( (opcode<<16) | 0x0f00 | prefix );
-		else
-		{
-			xWrite<u16>( 0x0f00 | prefix );
-			xWrite<u8>( opcode );
-		}
-	}
-	else
-	{
-		if( is16BitOpcode )
-		{
-			xWrite<u8>( 0x0f );
-			xWrite<u16>( opcode );
-		}
-		else
-			xWrite<u16>( (opcode<<8) | 0x0f );
-	}
-}
-
-// [SSE-3]
-const SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
-// [SSE-3]
-const SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
-
-const MovapsImplAll< 0, 0x28, 0x29 > xMOVAPS; 
-const MovapsImplAll< 0, 0x10, 0x11 > xMOVUPS;
-const MovapsImplAll< 0x66, 0x28, 0x29 > xMOVAPD;
-const MovapsImplAll< 0x66, 0x10, 0x11 > xMOVUPD;
-
-#ifdef ALWAYS_USE_MOVAPS
-const MovapsImplAll< 0x66, 0x6f, 0x7f > xMOVDQA;
-const MovapsImplAll< 0xf3, 0x6f, 0x7f > xMOVDQU;
-#else
-const MovapsImplAll< 0, 0x28, 0x29 > xMOVDQA;
-const MovapsImplAll< 0, 0x10, 0x11 > xMOVDQU;
-#endif
-
-const MovhlImplAll<0x16> xMOVH;
-const MovhlImplAll<0x12> xMOVL;
-const MovhlImpl_RtoR<0x16> xMOVLH;
-const MovhlImpl_RtoR<0x12> xMOVHL;
-
-const SimdImpl_DestRegEither<0x66,0xdb> xPAND;
-const SimdImpl_DestRegEither<0x66,0xdf> xPANDN;
-const SimdImpl_DestRegEither<0x66,0xeb> xPOR;
-const SimdImpl_DestRegEither<0x66,0xef> xPXOR;
-
-const SimdImpl_AndNot xANDN;
-
-const SimdImpl_UcomI<0x66,0x2e> xUCOMI;
-const SimdImpl_rSqrt<0x53> xRCP;
-const SimdImpl_rSqrt<0x52> xRSQRT;
-const SimdImpl_Sqrt<0x51> xSQRT;
-
-const SimdImpl_MinMax<0x5f> xMAX;
-const SimdImpl_MinMax<0x5d> xMIN;
-const SimdImpl_Shuffle<0xc6> xSHUF;
-
-// ------------------------------------------------------------------------
-
-const SimdImpl_Compare<SSE2_Equal>		xCMPEQ;
-const SimdImpl_Compare<SSE2_Less>			xCMPLT;
-const SimdImpl_Compare<SSE2_LessOrEqual>	xCMPLE;
-const SimdImpl_Compare<SSE2_Unordered>	xCMPUNORD;
-const SimdImpl_Compare<SSE2_NotEqual>		xCMPNE;
-const SimdImpl_Compare<SSE2_NotLess>		xCMPNLT;
-const SimdImpl_Compare<SSE2_NotLessOrEqual> xCMPNLE;
-const SimdImpl_Compare<SSE2_Ordered>		xCMPORD;
-
-// ------------------------------------------------------------------------
-// SSE Conversion Operations, as looney as they are.
-// 
-// These enforce pointer strictness for Indirect forms, due to the otherwise completely confusing
-// nature of the functions.  (so if a function expects an m32, you must use (u32*) or ptr32[]).
-//
-const SimdImpl_DestRegStrict<0xf3,0xe6,xRegisterSSE,xRegisterSSE,u64>		xCVTDQ2PD;
-const SimdImpl_DestRegStrict<0x00,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTDQ2PS;
-
-const SimdImpl_DestRegStrict<0xf2,0xe6,xRegisterSSE,xRegisterSSE,u128>		xCVTPD2DQ;
-const SimdImpl_DestRegStrict<0x66,0x2d,xRegisterMMX,xRegisterSSE,u128>		xCVTPD2PI;
-const SimdImpl_DestRegStrict<0x66,0x5a,xRegisterSSE,xRegisterSSE,u128>		xCVTPD2PS;
-
-const SimdImpl_DestRegStrict<0x66,0x2a,xRegisterSSE,xRegisterMMX,u64>		xCVTPI2PD;
-const SimdImpl_DestRegStrict<0x00,0x2a,xRegisterSSE,xRegisterMMX,u64>		xCVTPI2PS;
-
-const SimdImpl_DestRegStrict<0x66,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTPS2DQ;
-const SimdImpl_DestRegStrict<0x00,0x5a,xRegisterSSE,xRegisterSSE,u64>		xCVTPS2PD;
-const SimdImpl_DestRegStrict<0x00,0x2d,xRegisterMMX,xRegisterSSE,u64>		xCVTPS2PI;
-
-const SimdImpl_DestRegStrict<0xf2,0x2d,xRegister32, xRegisterSSE,u64>		xCVTSD2SI;
-const SimdImpl_DestRegStrict<0xf2,0x5a,xRegisterSSE,xRegisterSSE,u64>		xCVTSD2SS;
-const SimdImpl_DestRegStrict<0xf2,0x2a,xRegisterMMX,xRegister32, u32>		xCVTSI2SD;
-const SimdImpl_DestRegStrict<0xf3,0x2a,xRegisterSSE,xRegister32, u32>		xCVTSI2SS;
-
-const SimdImpl_DestRegStrict<0xf3,0x5a,xRegisterSSE,xRegisterSSE,u32>		xCVTSS2SD;
-const SimdImpl_DestRegStrict<0xf3,0x2d,xRegister32, xRegisterSSE,u32>		xCVTSS2SI;
-
-const SimdImpl_DestRegStrict<0x66,0xe6,xRegisterSSE,xRegisterSSE,u128>		xCVTTPD2DQ;
-const SimdImpl_DestRegStrict<0x66,0x2c,xRegisterMMX,xRegisterSSE,u128>		xCVTTPD2PI;
-const SimdImpl_DestRegStrict<0xf3,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTTPS2DQ;
-const SimdImpl_DestRegStrict<0x00,0x2c,xRegisterMMX,xRegisterSSE,u64>		xCVTTPS2PI;
-
-const SimdImpl_DestRegStrict<0xf2,0x2c,xRegister32, xRegisterSSE,u64>		xCVTTSD2SI;
-const SimdImpl_DestRegStrict<0xf3,0x2c,xRegister32, xRegisterSSE,u32>		xCVTTSS2SI;
-
-// ------------------------------------------------------------------------
-
-const SimdImpl_Shift<0xd0, 2> xPSRL;
-const SimdImpl_Shift<0xf0, 6> xPSLL;
-const SimdImpl_ShiftWithoutQ<0xe0, 4> xPSRA;
-
-const SimdImpl_AddSub<0xdc, 0xd4> xPADD;
-const SimdImpl_AddSub<0xd8, 0xfb> xPSUB;
-const SimdImpl_PMinMax<0xde,0x3c> xPMAX;
-const SimdImpl_PMinMax<0xda,0x38> xPMIN;
-
-const SimdImpl_PMul xPMUL;
-const SimdImpl_PCompare xPCMP;
-const SimdImpl_PShuffle xPSHUF;
-const SimdImpl_PUnpack xPUNPCK;
-const SimdImpl_Unpack xUNPCK;
-const SimdImpl_Pack xPACK;
-
-const SimdImpl_PAbsolute xPABS;
-const SimdImpl_PSign xPSIGN;
-const SimdImpl_PInsert xPINSR;
-const SimdImpl_PExtract xPEXTR;
-const SimdImpl_PMultAdd xPMADD;
-const SimdImpl_HorizAdd xHADD;
-
-
-//////////////////////////////////////////////////////////////////////////////////////////
-//
-
-__emitinline void xEMMS()
-{
-	xWrite<u16>( 0x770F );
-}
-
-// Store Streaming SIMD Extension Control/Status to Mem32.
-__emitinline void xSTMXCSR( u32* dest )
-{
-	SimdPrefix( 0, 0xae );
-	xWriteDisp( 3, dest );
-}
-
-// Load Streaming SIMD Extension Control/Status from Mem32.
-__emitinline void xLDMXCSR( const u32* src )
-{
-	SimdPrefix( 0, 0xae );
-	xWriteDisp( 2, src );
-}
-
-
-// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
-// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const xRegisterSSE& from )	{ writeXMMop( 0xf3, 0x7e, to, from ); }
-
-// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
-// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const ModSibBase& src )		{ writeXMMop( 0xf3, 0x7e, to, src ); }
-
-// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
-// being cleared to zero.
-__forceinline void xMOVQZX( const xRegisterSSE& to, const void* src )			{ writeXMMop( 0xf3, 0x7e, to, src ); }
-
-// Moves lower quad of XMM to ptr64 (no bits are cleared)
-__forceinline void xMOVQ( const ModSibBase& dest, const xRegisterSSE& from )	{ writeXMMop( 0x66, 0xd6, from, dest ); }
-// Moves lower quad of XMM to ptr64 (no bits are cleared)
-__forceinline void xMOVQ( void* dest, const xRegisterSSE& from )				{ writeXMMop( 0x66, 0xd6, from, dest ); }
-
-__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterMMX& from )	{ if( to != from ) writeXMMop( 0x6f, to, from ); }
-__forceinline void xMOVQ( const xRegisterMMX& to, const ModSibBase& src )		{ writeXMMop( 0x6f, to, src ); }
-__forceinline void xMOVQ( const xRegisterMMX& to, const void* src )				{ writeXMMop( 0x6f, to, src ); }
-__forceinline void xMOVQ( const ModSibBase& dest, const xRegisterMMX& from )	{ writeXMMop( 0x7f, from, dest ); }
-__forceinline void xMOVQ( void* dest, const xRegisterMMX& from )				{ writeXMMop( 0x7f, from, dest ); }
-
-// This form of xMOVQ is Intel's adeptly named 'MOVQ2DQ'
-__forceinline void xMOVQ( const xRegisterSSE& to, const xRegisterMMX& from )	{ writeXMMop( 0xf3, 0xd6, to, from ); }
-
-// This form of xMOVQ is Intel's adeptly named 'MOVDQ2Q'
-__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from )
-{
-	// Manual implementation of this form of MOVQ, since its parameters are unique in a way
-	// that breaks the template inference of writeXMMop();
-
-	SimdPrefix( 0xf2, 0xd6 );
-	ModRM_Direct( to.Id, from.Id );
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-//
-
-#define IMPLEMENT_xMOVS( ssd, prefix ) \
-	__forceinline void xMOV##ssd( const xRegisterSSE& to, const xRegisterSSE& from )	{ if( to != from ) writeXMMop( prefix, 0x10, to, from ); } \
-	__forceinline void xMOV##ssd##ZX( const xRegisterSSE& to, const void* from )		{ writeXMMop( prefix, 0x10, to, from ); } \
-	__forceinline void xMOV##ssd##ZX( const xRegisterSSE& to, const ModSibBase& from )	{ writeXMMop( prefix, 0x10, to, from ); } \
-	__forceinline void xMOV##ssd( const void* to, const xRegisterSSE& from )			{ writeXMMop( prefix, 0x11, from, to ); } \
-	__forceinline void xMOV##ssd( const ModSibBase& to, const xRegisterSSE& from )		{ writeXMMop( prefix, 0x11, from, to ); }
-
-IMPLEMENT_xMOVS( SS, 0xf3 )
-IMPLEMENT_xMOVS( SD, 0xf2 )
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// Non-temporal movs only support a register as a target (ie, load form only, no stores)
-//
-
-__forceinline void xMOVNTDQA( const xRegisterSSE& to, const void* from )
-{
-	xWrite<u32>( 0x2A380f66 );
-	xWriteDisp( to.Id, from );
-}
-
-__forceinline void xMOVNTDQA( const xRegisterSSE& to, const ModSibBase& from )
-{
-	xWrite<u32>( 0x2A380f66 );
-	EmitSibMagic( to.Id, from );
-}
-
-__forceinline void xMOVNTDQ( void* to, const xRegisterSSE& from )			{ writeXMMop( 0x66, 0xe7, from, to ); }
-__forceinline void xMOVNTDQA( const ModSibBase& to, const xRegisterSSE& from )	{ writeXMMop( 0x66, 0xe7, from, to ); }
-
-__forceinline void xMOVNTPD( void* to, const xRegisterSSE& from )			{ writeXMMop( 0x66, 0x2b, from, to ); }
-__forceinline void xMOVNTPD( const ModSibBase& to, const xRegisterSSE& from )	{ writeXMMop( 0x66, 0x2b, from, to ); }
-__forceinline void xMOVNTPS( void* to, const xRegisterSSE& from )			{ writeXMMop( 0x2b, from, to ); }
-__forceinline void xMOVNTPS( const ModSibBase& to, const xRegisterSSE& from )	{ writeXMMop( 0x2b, from, to ); }
-
-__forceinline void xMOVNTQ( void* to, const xRegisterMMX& from )			{ writeXMMop( 0xe7, from, to ); }
-__forceinline void xMOVNTQ( const ModSibBase& to, const xRegisterMMX& from )	{ writeXMMop( 0xe7, from, to ); }
-
-__forceinline void xMOVMSKPS( const xRegister32& to, const xRegisterSSE& from)	{ writeXMMop( 0x50, to, from ); }
-__forceinline void xMOVMSKPD( const xRegister32& to, const xRegisterSSE& from)	{ writeXMMop( 0x66, 0x50, to, from, true ); }
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// INSERTPS / EXTRACTPS   [SSE4.1 only!]
-//
-// [TODO] these might be served better as classes, especially if other instructions use
-// the M32,sse,imm form (I forget offhand if any do).
-
-
-// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
-// location in dest, and selectively zero out the data elements in dest according to
-// the mask  field in the immediate byte. The source operand can be a memory location
-// (32 bits) or an XMM register (lower 32 bits used).
-//
-// Imm8 provides three fields:
-//  * COUNT_S: The value of Imm8[7:6] selects the dword element from src.  It is 0 if
-//    the source is a memory operand.
-//  * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
-//  * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to  be written
-//    with 0.0 if set to 1.
-//
-__emitinline void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x213a, to, from );
-	xWrite<u8>( imm8 );
-}
-
-__emitinline void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x213a, to, from );
-	xWrite<u8>( imm8 );
-}
-
-__emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x213a, to, from );
-	xWrite<u8>( imm8 );
-}
-
-// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
-// determined by imm8[1-0]*32. The extracted single precision floating-point value
-// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
-//
-__emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x173a, to, from, true );
-	xWrite<u8>( imm8 );
-}
-
-__emitinline void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x173a, from, dest, true );
-	xWrite<u8>( imm8 );
-}
-
-__emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 )
-{
-	writeXMMop( 0x66, 0x173a, from, dest, true );
-	xWrite<u8>( imm8 );
-}
-
-
 }
--- a/pcsx2/x86/ix86/ix86_instructions.h
+++ b/pcsx2/x86/ix86/ix86_instructions.h
@ -86,16 +86,16 @@ namespace x86Emitter
 	extern const Internal::DwordShiftImplAll<false> xSHLD;
 	extern const Internal::DwordShiftImplAll<true>  xSHRD;

-	extern const Internal::Group8ImplAll<Internal::G8Type_BT> xBT;
-	extern const Internal::Group8ImplAll<Internal::G8Type_BTR> xBTR;
-	extern const Internal::Group8ImplAll<Internal::G8Type_BTS> xBTS;
-	extern const Internal::Group8ImplAll<Internal::G8Type_BTC> xBTC;
+	extern const Internal::Group8Impl<Internal::G8Type_BT> xBT;
+	extern const Internal::Group8Impl<Internal::G8Type_BTR> xBTR;
+	extern const Internal::Group8Impl<Internal::G8Type_BTS> xBTS;
+	extern const Internal::Group8Impl<Internal::G8Type_BTC> xBTC;

 	extern const Internal::JmpCallImplAll<true> xJMP;
 	extern const Internal::JmpCallImplAll<false> xCALL;

-	extern const Internal::BitScanImplAll<false> xBSF;
-	extern const Internal::BitScanImplAll<true> xBSR;
+	extern const Internal::BitScanImpl<0xbc> xBSF;
+	extern const Internal::BitScanImpl<0xbd> xBSR;

 	// ------------------------------------------------------------------------
 	extern const Internal::CMovImplGeneric xCMOV;
@ -299,95 +299,28 @@ namespace x86Emitter
 	typedef xForwardJPO<s8>		xForwardJPO8;
 	typedef xForwardJPO<s32>	xForwardJPO32;

-	//////////////////////////////////////////////////////////////////////////////////////////
-	// MMX Mov Instructions (MOVD, MOVQ, MOVSS).
-	//
-	// Notes:
-	//  * Some of the functions have been renamed to more clearly reflect what they actually
-	//    do.  Namely we've affixed "ZX" to several MOVs that take a register as a destination
-	//    since that's what they do (MOVD clears upper 32/96 bits, etc).
-	//
-	
-	// ------------------------------------------------------------------------
-	// MOVD has valid forms for MMX and XMM registers.
-	//
-	template< typename T >
-	__emitinline void xMOVDZX( const xRegisterSIMD<T>& to, const xRegister32& from )
-	{
-		Internal::writeXMMop( 0x66, 0x6e, to, from );
-	}
-
-	template< typename T >
-	__emitinline void xMOVDZX( const xRegisterSIMD<T>& to, const void* src )
-	{
-		Internal::writeXMMop( 0x66, 0x6e, to, src );
-	}
-
-	template< typename T >
-	void xMOVDZX( const xRegisterSIMD<T>& to, const ModSibBase& src )
-	{
-		Internal::writeXMMop( 0x66, 0x6e, to, src );
-	}
-
-	template< typename T >
-	__emitinline void xMOVD( const xRegister32& to, const xRegisterSIMD<T>& from )
-	{
-		Internal::writeXMMop( 0x66, 0x7e, from, to );
-	}
-
-	template< typename T >
-	__emitinline void xMOVD( void* dest, const xRegisterSIMD<T>& from )
-	{
-		Internal::writeXMMop( 0x66, 0x7e, from, dest );
-	}
-
-	template< typename T >
-	void xMOVD( const ModSibBase& dest, const xRegisterSIMD<T>& from )
-	{
-		Internal::writeXMMop( 0x66, 0x7e, from, dest );
-	}
-
-
-	// ------------------------------------------------------------------------
-
-	// xMASKMOV:
-	// Selectively write bytes from mm1/xmm1 to memory location using the byte mask in mm2/xmm2.
-	// The default memory location is specified by DS:EDI.  The most significant bit in each byte
-	// of the mask operand determines whether the corresponding byte in the source operand is
-	// written to the corresponding byte location in memory.
-
-	template< typename T >
-	static __forceinline void xMASKMOV( const xRegisterSIMD<T>& to, const xRegisterSIMD<T>& from )	{ Internal::writeXMMop( 0x66, 0xf7, to, from ); }
-
-	// xPMOVMSKB:
-	// Creates a mask made up of the most significant bit of each byte of the source 
-	// operand and stores the result in the low byte or word of the destination operand.
-	// Upper bits of the destination are cleared to zero.
-	//
-	// When operating on a 64-bit (MMX) source, the byte mask is 8 bits; when operating on
-	// 128-bit (SSE) source, the byte mask is 16-bits.
-	//
-	template< typename T >
-	static __forceinline void xPMOVMSKB( const xRegister32& to, const xRegisterSIMD<T>& from )	{ Internal::writeXMMop( 0x66, 0xd7, to, from ); }
-	
-	// [sSSE-3] Concatenates dest and source operands into an intermediate composite,
-	// shifts the composite at byte granularity to the right by a constant immediate,
-	// and extracts the right-aligned result into the destination.
-	//
-	template< typename T >
-	static __forceinline void xPALIGNR( const xRegisterSIMD<T>& to, const xRegisterSIMD<T>& from, u8 imm8 )
-	{
-		Internal::writeXMMop( 0x66, 0x0f3a, to, from );
-		xWrite<u8>( imm8 );
-	}
-
-
 	// ------------------------------------------------------------------------

 	extern void xEMMS();
 	extern void xSTMXCSR( u32* dest );
 	extern void xLDMXCSR( const u32* src );

+	extern void xMOVDZX( const xRegisterSSE& to, const xRegister32& from );
+	extern void xMOVDZX( const xRegisterSSE& to, const void* src );
+	extern void xMOVDZX( const xRegisterSSE& to, const ModSibBase& src );
+
+	extern void xMOVDZX( const xRegisterMMX& to, const xRegister32& from );
+	extern void xMOVDZX( const xRegisterMMX& to, const void* src );
+	extern void xMOVDZX( const xRegisterMMX& to, const ModSibBase& src );
+
+	extern void xMOVD( const xRegister32& to, const xRegisterSSE& from );
+	extern void xMOVD( void* dest, const xRegisterSSE& from );
+	extern void xMOVD( const ModSibBase& dest, const xRegisterSSE& from );
+
+	extern void xMOVD( const xRegister32& to, const xRegisterMMX& from );
+	extern void xMOVD( void* dest, const xRegisterMMX& from );
+	extern void xMOVD( const ModSibBase& dest, const xRegisterMMX& from );
+
 	extern void xMOVQ( const xRegisterMMX& to, const xRegisterMMX& from );
 	extern void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from );
 	extern void xMOVQ( const xRegisterSSE& to, const xRegisterMMX& from );
@ -430,31 +363,28 @@ namespace x86Emitter
 	extern void xMOVMSKPS( const xRegister32& to, const xRegisterSSE& from );
 	extern void xMOVMSKPD( const xRegister32& to, const xRegisterSSE& from );

-	extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
-	extern void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 );
-	extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
-
-	extern void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 );
-	extern void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 );
-	extern void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 );
+	extern void xMASKMOV( const xRegisterSSE& to, const xRegisterSSE& from );
+	extern void xMASKMOV( const xRegisterMMX& to, const xRegisterMMX& from );
+	extern void xPMOVMSKB( const xRegister32& to, const xRegisterSSE& from );
+	extern void xPMOVMSKB( const xRegister32& to, const xRegisterMMX& from );
+	extern void xPALIGNR( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
+	extern void xPALIGNR( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm8 );

 	// ------------------------------------------------------------------------

-	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
-	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
-
-	extern const Internal::MovapsImplAll<0, 0x28, 0x29> xMOVAPS;
-	extern const Internal::MovapsImplAll<0, 0x10, 0x11> xMOVUPS;
-
-	extern const Internal::MovapsImplAll<0x66, 0x28, 0x29> xMOVAPD;
-	extern const Internal::MovapsImplAll<0x66, 0x10, 0x11> xMOVUPD;
+	extern const Internal::SimdImpl_MoveSSE<0x00,true> xMOVAPS;
+	extern const Internal::SimdImpl_MoveSSE<0x00,false> xMOVUPS;

 #ifdef ALWAYS_USE_MOVAPS
-	extern const Internal::MovapsImplAll<0x66, 0x6f, 0x7f> xMOVDQA;
-	extern const Internal::MovapsImplAll<0xf3, 0x6f, 0x7f> xMOVDQU;
+	extern const Internal::SimdImpl_MoveSSE<0,true> xMOVDQA;
+	extern const Internal::SimdImpl_MoveSSE<0,false> xMOVDQU;
+	extern const Internal::SimdImpl_MoveSSE<0,true> xMOVAPD;
+	extern const Internal::SimdImpl_MoveSSE<0,false> xMOVUPD;
 #else
-	extern const Internal::MovapsImplAll<0, 0x28, 0x29> xMOVDQA;
-	extern const Internal::MovapsImplAll<0, 0x10, 0x11> xMOVDQU;
+	extern const Internal::SimdImpl_MoveDQ<0x66, 0x6f, 0x7f> xMOVDQA;
+	extern const Internal::SimdImpl_MoveDQ<0xf3, 0x6f, 0x7f> xMOVDQU;
+	extern const Internal::SimdImpl_MoveSSE<0x66,true> xMOVAPD;
+	extern const Internal::SimdImpl_MoveSSE<0x66,false> xMOVUPD;
 #endif

 	extern const Internal::MovhlImpl_RtoR<0x16> xMOVLH;
@ -463,6 +393,17 @@ namespace x86Emitter
 	extern const Internal::MovhlImplAll<0x16> xMOVH;
 	extern const Internal::MovhlImplAll<0x12> xMOVL;

+	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
+	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
+
+	extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
+	extern void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 );
+	extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
+
+	extern void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 );
+	extern void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 );
+	extern void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 );
+
 	// ------------------------------------------------------------------------
 	
 	extern const Internal::SimdImpl_DestRegEither<0x66,0xdb> xPAND;
@ -483,6 +424,8 @@ namespace x86Emitter

 	// ------------------------------------------------------------------------

+	extern const Internal::SimdImpl_DestRegSSE<0x66,0x1738> xPTEST;
+	
 	extern const Internal::SimdImpl_Compare<SSE2_Equal>			xCMPEQ;
 	extern const Internal::SimdImpl_Compare<SSE2_Less>			xCMPLT;
 	extern const Internal::SimdImpl_Compare<SSE2_LessOrEqual>	xCMPLE;
@ -527,8 +470,8 @@ namespace x86Emitter
 	
 	// ------------------------------------------------------------------------
 	
-	extern const Internal::SimdImpl_Shift<0xd0, 2> xPSRL;
 	extern const Internal::SimdImpl_Shift<0xf0, 6> xPSLL;
+	extern const Internal::SimdImpl_Shift<0xd0, 2> xPSRL;
 	extern const Internal::SimdImpl_ShiftWithoutQ<0xe0, 4> xPSRA;

 	extern const Internal::SimdImpl_AddSub<0xdc, 0xd4> xPADD;
@ -550,5 +493,12 @@ namespace x86Emitter
 	extern const Internal::SimdImpl_PMultAdd xPMADD;
 	extern const Internal::SimdImpl_HorizAdd xHADD;

+	extern const Internal::SimdImpl_Blend xBLEND;
+	extern const Internal::SimdImpl_DotProduct xDP;
+	extern const Internal::SimdImpl_Round xROUND;
+	
+	extern const Internal::SimdImpl_PMove<true> xPMOVSX;
+	extern const Internal::SimdImpl_PMove<false> xPMOVZX;
+
 }

--- a/pcsx2/x86/ix86/ix86_legacy_mmx.cpp
+++ b/pcsx2/x86/ix86/ix86_legacy_mmx.cpp
@ -1,124 +0,0 @@
-/*  Pcsx2 - Pc Ps2 Emulator
- *  Copyright (C) 2002-2009  Pcsx2 Team
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *  
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *  
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
- */
-
-#include "PrecompiledHeader.h"
-#include "ix86_legacy_internal.h"
-
-//------------------------------------------------------------------
-// MMX instructions
-//
-// note: r64 = mm
-//------------------------------------------------------------------
-
-using namespace x86Emitter;
-
-emitterT void MOVQMtoR( x86MMXRegType to, uptr from )							{ xMOVQ( xRegisterMMX(to), (void*)from ); }
-emitterT void MOVQRtoM( uptr to, x86MMXRegType from )							{ xMOVQ( (void*)to, xRegisterMMX(from) ); }
-emitterT void MOVQRtoR( x86MMXRegType to, x86MMXRegType from )					{ xMOVQ( xRegisterMMX(to), xRegisterMMX(from) ); }
-emitterT void MOVQRmtoR( x86MMXRegType to, x86IntRegType from, int offset )		{ xMOVQ( xRegisterMMX(to), ptr[xAddressReg(from)+offset] ); }
-emitterT void MOVQRtoRm( x86IntRegType to, x86MMXRegType from, int offset )		{ xMOVQ( ptr[xAddressReg(to)+offset], xRegisterMMX(from) ); }
-
-emitterT void MOVDMtoMMX( x86MMXRegType to, uptr from )							{ xMOVDZX( xRegisterMMX(to), (void*)from ); }
-emitterT void MOVDMMXtoM( uptr to, x86MMXRegType from )							{ xMOVD( (void*)to, xRegisterMMX(from) ); }
-emitterT void MOVD32RtoMMX( x86MMXRegType to, x86IntRegType from )				{ xMOVDZX( xRegisterMMX(to), xRegister32(from) ); }
-emitterT void MOVD32RmtoMMX( x86MMXRegType to, x86IntRegType from, int offset )	{ xMOVDZX( xRegisterMMX(to), ptr[xAddressReg(from)+offset] ); }
-emitterT void MOVD32MMXtoR( x86IntRegType to, x86MMXRegType from )				{ xMOVD( xRegister32(to), xRegisterMMX(from) ); }
-emitterT void MOVD32MMXtoRm( x86IntRegType to, x86MMXRegType from, int offset )	{ xMOVD( ptr[xAddressReg(to)+offset], xRegisterMMX(from) ); }
-
-emitterT void PMOVMSKBMMXtoR(x86IntRegType to, x86MMXRegType from)				{ xPMOVMSKB( xRegister32(to), xRegisterMMX(from) ); }
-emitterT void MASKMOVQRtoR(x86MMXRegType to, x86MMXRegType from)				{ xMASKMOV( xRegisterMMX(to), xRegisterMMX(from) ); }
-
-#define DEFINE_LEGACY_LOGIC_OPCODE( mod ) \
-	emitterT void P##mod##RtoR( x86MMXRegType to, x86MMXRegType from )				{ xP##mod( xRegisterMMX(to), xRegisterMMX(from) ); } \
-	emitterT void P##mod##MtoR( x86MMXRegType to, uptr from )						{ xP##mod( xRegisterMMX(to), (void*)from ); } \
-	emitterT void SSE2_P##mod##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod( xRegisterSSE(to), xRegisterSSE(from) ); } \
-	emitterT void SSE2_P##mod##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod( xRegisterSSE(to), (void*)from ); }
-
-#define DEFINE_LEGACY_ARITHMETIC( mod, sub ) \
-	emitterT void P##mod##sub##RtoR( x86MMXRegType to, x86MMXRegType from )			{ xP##mod.sub( xRegisterMMX(to), xRegisterMMX(from) ); } \
-	emitterT void P##mod##sub##MtoR( x86MMXRegType to, uptr from )					{ xP##mod.sub( xRegisterMMX(to), (void*)from ); } \
-	emitterT void SSE2_P##mod##sub##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod.sub( xRegisterSSE(to), xRegisterSSE(from) ); } \
-	emitterT void SSE2_P##mod##sub##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod.sub( xRegisterSSE(to), (void*)from ); }
-
-#define DEFINE_LEGACY_SHIFT_STUFF( mod, sub ) \
-	emitterT void P##mod##sub##RtoR( x86MMXRegType to, x86MMXRegType from )			{ xP##mod.sub( xRegisterMMX(to), xRegisterMMX(from) ); } \
-	emitterT void P##mod##sub##MtoR( x86MMXRegType to, uptr from )					{ xP##mod.sub( xRegisterMMX(to), (void*)from ); } \
-	emitterT void P##mod##sub##ItoR( x86MMXRegType to, u8 imm )						{ xP##mod.sub( xRegisterMMX(to), imm ); } \
-	emitterT void SSE2_P##mod##sub##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod.sub( xRegisterSSE(to), xRegisterSSE(from) ); } \
-	emitterT void SSE2_P##mod##sub##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod.sub( xRegisterSSE(to), (void*)from ); } \
-	emitterT void SSE2_P##mod##sub##_I8_to_XMM( x86SSERegType to, u8 imm )				{ xP##mod.sub( xRegisterSSE(to), imm ); }
-
-#define DEFINE_LEGACY_SHIFT_OPCODE( mod ) \
-	DEFINE_LEGACY_SHIFT_STUFF( mod, Q ) \
-	DEFINE_LEGACY_SHIFT_STUFF( mod, D ) \
-	DEFINE_LEGACY_SHIFT_STUFF( mod, W ) \
-	emitterT void SSE2_P##mod##DQ_I8_to_XMM( x86MMXRegType to, u8 imm )					{ xP##mod.DQ( xRegisterSSE(to), imm ); }
-
-DEFINE_LEGACY_LOGIC_OPCODE( AND )
-DEFINE_LEGACY_LOGIC_OPCODE( ANDN )
-DEFINE_LEGACY_LOGIC_OPCODE( OR )
-DEFINE_LEGACY_LOGIC_OPCODE( XOR )
-
-DEFINE_LEGACY_SHIFT_OPCODE( SLL )
-DEFINE_LEGACY_SHIFT_OPCODE( SRL )
-DEFINE_LEGACY_SHIFT_STUFF( SRA, D )
-DEFINE_LEGACY_SHIFT_STUFF( SRA, W )
-
-DEFINE_LEGACY_ARITHMETIC( ADD, B )
-DEFINE_LEGACY_ARITHMETIC( ADD, W )
-DEFINE_LEGACY_ARITHMETIC( ADD, D )
-DEFINE_LEGACY_ARITHMETIC( ADD, Q )
-DEFINE_LEGACY_ARITHMETIC( ADD, SB )
-DEFINE_LEGACY_ARITHMETIC( ADD, SW )
-DEFINE_LEGACY_ARITHMETIC( ADD, USB )
-DEFINE_LEGACY_ARITHMETIC( ADD, USW )
-
-DEFINE_LEGACY_ARITHMETIC( SUB, B )
-DEFINE_LEGACY_ARITHMETIC( SUB, W )
-DEFINE_LEGACY_ARITHMETIC( SUB, D )
-DEFINE_LEGACY_ARITHMETIC( SUB, Q )
-DEFINE_LEGACY_ARITHMETIC( SUB, SB )
-DEFINE_LEGACY_ARITHMETIC( SUB, SW )
-DEFINE_LEGACY_ARITHMETIC( SUB, USB )
-DEFINE_LEGACY_ARITHMETIC( SUB, USW )
-
-DEFINE_LEGACY_ARITHMETIC( CMP, EQB );
-DEFINE_LEGACY_ARITHMETIC( CMP, EQW );
-DEFINE_LEGACY_ARITHMETIC( CMP, EQD );
-DEFINE_LEGACY_ARITHMETIC( CMP, GTB );
-DEFINE_LEGACY_ARITHMETIC( CMP, GTW );
-DEFINE_LEGACY_ARITHMETIC( CMP, GTD );
-
-DEFINE_LEGACY_ARITHMETIC( UNPCK, HDQ );
-DEFINE_LEGACY_ARITHMETIC( UNPCK, LDQ );
-DEFINE_LEGACY_ARITHMETIC( UNPCK, HBW );
-DEFINE_LEGACY_ARITHMETIC( UNPCK, LBW );
-
-DEFINE_LEGACY_ARITHMETIC( UNPCK, LWD );
-DEFINE_LEGACY_ARITHMETIC( UNPCK, HWD );
-
-
-emitterT void PMULUDQMtoR( x86MMXRegType to, uptr from )					{ xPMUL.UDQ( xRegisterMMX( to ), (void*)from ); }
-emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from )			{ xPMUL.UDQ( xRegisterMMX( to ), xRegisterMMX( from ) ); }
-
-emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8)		{ xPSHUF.W( xRegisterMMX(to), xRegisterMMX(from), imm8 ); }
-emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8)				{ xPSHUF.W( xRegisterMMX(to), (void*)from, imm8 ); }
-
-emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 ) { xPINSR.W( xRegisterMMX(to), xRegister32(from), imm8 ); }
-
-emitterT void EMMS() { xEMMS(); }
--- a/pcsx2/x86/ix86/ix86_legacy_sse.cpp
+++ b/pcsx2/x86/ix86/ix86_legacy_sse.cpp
@ -22,35 +22,109 @@

 using namespace x86Emitter;

+// ------------------------------------------------------------------------
+//                         MMX / SSE Mixed Bag
+// ------------------------------------------------------------------------

-//------------------------------------------------------------------
-// SSE instructions
-//------------------------------------------------------------------
+emitterT void MOVQMtoR( x86MMXRegType to, uptr from )							{ xMOVQ( xRegisterMMX(to), (void*)from ); }
+emitterT void MOVQRtoM( uptr to, x86MMXRegType from )							{ xMOVQ( (void*)to, xRegisterMMX(from) ); }
+emitterT void MOVQRtoR( x86MMXRegType to, x86MMXRegType from )					{ xMOVQ( xRegisterMMX(to), xRegisterMMX(from) ); }
+emitterT void MOVQRmtoR( x86MMXRegType to, x86IntRegType from, int offset )		{ xMOVQ( xRegisterMMX(to), ptr[xAddressReg(from)+offset] ); }
+emitterT void MOVQRtoRm( x86IntRegType to, x86MMXRegType from, int offset )		{ xMOVQ( ptr[xAddressReg(to)+offset], xRegisterMMX(from) ); }

-#define SSEMtoR( code, overb ) \
-	assert( to < iREGCNT_XMM ), \
-	RexR(0, to),             \
-	write16( code ), \
-	ModRM( 0, to, DISP32 ), \
-	write32( MEMADDR(from, 4 + overb) )
+emitterT void MOVDMtoMMX( x86MMXRegType to, uptr from )							{ xMOVDZX( xRegisterMMX(to), (void*)from ); }
+emitterT void MOVDMMXtoM( uptr to, x86MMXRegType from )							{ xMOVD( (void*)to, xRegisterMMX(from) ); }
+emitterT void MOVD32RtoMMX( x86MMXRegType to, x86IntRegType from )				{ xMOVDZX( xRegisterMMX(to), xRegister32(from) ); }
+emitterT void MOVD32RmtoMMX( x86MMXRegType to, x86IntRegType from, int offset )	{ xMOVDZX( xRegisterMMX(to), ptr[xAddressReg(from)+offset] ); }
+emitterT void MOVD32MMXtoR( x86IntRegType to, x86MMXRegType from )				{ xMOVD( xRegister32(to), xRegisterMMX(from) ); }
+emitterT void MOVD32MMXtoRm( x86IntRegType to, x86MMXRegType from, int offset )	{ xMOVD( ptr[xAddressReg(to)+offset], xRegisterMMX(from) ); }

-#define SSERtoR( code ) \
-	assert( to < iREGCNT_XMM && from < iREGCNT_XMM), \
-    RexRB(0, to, from),            \
-	write16( code ), \
-	ModRM( 3, to, from )
+emitterT void PMOVMSKBMMXtoR(x86IntRegType to, x86MMXRegType from)				{ xPMOVMSKB( xRegister32(to), xRegisterMMX(from) ); }
+emitterT void MASKMOVQRtoR(x86MMXRegType to, x86MMXRegType from)				{ xMASKMOV( xRegisterMMX(to), xRegisterMMX(from) ); }

-#define SSEMtoR66( code ) \
-	write8( 0x66 ), \
-	SSEMtoR( code, 0 )
+#define DEFINE_LEGACY_LOGIC_OPCODE( mod ) \
+	emitterT void P##mod##RtoR( x86MMXRegType to, x86MMXRegType from )				{ xP##mod( xRegisterMMX(to), xRegisterMMX(from) ); } \
+	emitterT void P##mod##MtoR( x86MMXRegType to, uptr from )						{ xP##mod( xRegisterMMX(to), (void*)from ); } \
+	emitterT void SSE2_P##mod##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod( xRegisterSSE(to), xRegisterSSE(from) ); } \
+	emitterT void SSE2_P##mod##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod( xRegisterSSE(to), (void*)from ); }

-#define SSERtoM66( code ) \
-	write8( 0x66 ), \
-	SSERtoM( code, 0 )
+#define DEFINE_LEGACY_ARITHMETIC( mod, sub ) \
+	emitterT void P##mod##sub##RtoR( x86MMXRegType to, x86MMXRegType from )				{ xP##mod.sub( xRegisterMMX(to), xRegisterMMX(from) ); } \
+	emitterT void P##mod##sub##MtoR( x86MMXRegType to, uptr from )						{ xP##mod.sub( xRegisterMMX(to), (void*)from ); } \
+	emitterT void SSE2_P##mod##sub##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod.sub( xRegisterSSE(to), xRegisterSSE(from) ); } \
+	emitterT void SSE2_P##mod##sub##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod.sub( xRegisterSSE(to), (void*)from ); }

-#define SSERtoR66( code ) \
-	write8( 0x66 ), \
-	SSERtoR( code )
+#define DEFINE_LEGACY_SHIFT_STUFF( mod, sub ) \
+	emitterT void P##mod##sub##RtoR( x86MMXRegType to, x86MMXRegType from )				{ xP##mod.sub( xRegisterMMX(to), xRegisterMMX(from) ); } \
+	emitterT void P##mod##sub##MtoR( x86MMXRegType to, uptr from )						{ xP##mod.sub( xRegisterMMX(to), (void*)from ); } \
+	emitterT void P##mod##sub##ItoR( x86MMXRegType to, u8 imm )							{ xP##mod.sub( xRegisterMMX(to), imm ); } \
+	emitterT void SSE2_P##mod##sub##_XMM_to_XMM( x86SSERegType to, x86SSERegType from )	{ xP##mod.sub( xRegisterSSE(to), xRegisterSSE(from) ); } \
+	emitterT void SSE2_P##mod##sub##_M128_to_XMM( x86SSERegType to, uptr from )			{ xP##mod.sub( xRegisterSSE(to), (void*)from ); } \
+	emitterT void SSE2_P##mod##sub##_I8_to_XMM( x86SSERegType to, u8 imm )				{ xP##mod.sub( xRegisterSSE(to), imm ); }
+
+#define DEFINE_LEGACY_SHIFT_OPCODE( mod ) \
+	DEFINE_LEGACY_SHIFT_STUFF( mod, Q ) \
+	DEFINE_LEGACY_SHIFT_STUFF( mod, D ) \
+	DEFINE_LEGACY_SHIFT_STUFF( mod, W ) \
+	emitterT void SSE2_P##mod##DQ_I8_to_XMM( x86MMXRegType to, u8 imm )					{ xP##mod.DQ( xRegisterSSE(to), imm ); }
+
+DEFINE_LEGACY_LOGIC_OPCODE( AND )
+DEFINE_LEGACY_LOGIC_OPCODE( ANDN )
+DEFINE_LEGACY_LOGIC_OPCODE( OR )
+DEFINE_LEGACY_LOGIC_OPCODE( XOR )
+
+DEFINE_LEGACY_SHIFT_OPCODE( SLL )
+DEFINE_LEGACY_SHIFT_OPCODE( SRL )
+DEFINE_LEGACY_SHIFT_STUFF( SRA, D )
+DEFINE_LEGACY_SHIFT_STUFF( SRA, W )
+
+DEFINE_LEGACY_ARITHMETIC( ADD, B )
+DEFINE_LEGACY_ARITHMETIC( ADD, W )
+DEFINE_LEGACY_ARITHMETIC( ADD, D )
+DEFINE_LEGACY_ARITHMETIC( ADD, Q )
+DEFINE_LEGACY_ARITHMETIC( ADD, SB )
+DEFINE_LEGACY_ARITHMETIC( ADD, SW )
+DEFINE_LEGACY_ARITHMETIC( ADD, USB )
+DEFINE_LEGACY_ARITHMETIC( ADD, USW )
+
+DEFINE_LEGACY_ARITHMETIC( SUB, B )
+DEFINE_LEGACY_ARITHMETIC( SUB, W )
+DEFINE_LEGACY_ARITHMETIC( SUB, D )
+DEFINE_LEGACY_ARITHMETIC( SUB, Q )
+DEFINE_LEGACY_ARITHMETIC( SUB, SB )
+DEFINE_LEGACY_ARITHMETIC( SUB, SW )
+DEFINE_LEGACY_ARITHMETIC( SUB, USB )
+DEFINE_LEGACY_ARITHMETIC( SUB, USW )
+
+DEFINE_LEGACY_ARITHMETIC( CMP, EQB );
+DEFINE_LEGACY_ARITHMETIC( CMP, EQW );
+DEFINE_LEGACY_ARITHMETIC( CMP, EQD );
+DEFINE_LEGACY_ARITHMETIC( CMP, GTB );
+DEFINE_LEGACY_ARITHMETIC( CMP, GTW );
+DEFINE_LEGACY_ARITHMETIC( CMP, GTD );
+
+DEFINE_LEGACY_ARITHMETIC( UNPCK, HDQ );
+DEFINE_LEGACY_ARITHMETIC( UNPCK, LDQ );
+DEFINE_LEGACY_ARITHMETIC( UNPCK, HBW );
+DEFINE_LEGACY_ARITHMETIC( UNPCK, LBW );
+
+DEFINE_LEGACY_ARITHMETIC( UNPCK, LWD );
+DEFINE_LEGACY_ARITHMETIC( UNPCK, HWD );
+
+
+emitterT void PMULUDQMtoR( x86MMXRegType to, uptr from )					{ xPMUL.UDQ( xRegisterMMX( to ), (void*)from ); }
+emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from )			{ xPMUL.UDQ( xRegisterMMX( to ), xRegisterMMX( from ) ); }
+
+emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8)		{ xPSHUF.W( xRegisterMMX(to), xRegisterMMX(from), imm8 ); }
+emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8)				{ xPSHUF.W( xRegisterMMX(to), (void*)from, imm8 ); }
+
+emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 ) { xPINSR.W( xRegisterMMX(to), xRegister32(from), imm8 ); }
+
+emitterT void EMMS() { xEMMS(); }
+
+// ------------------------------------------------------------------------
+//                         Begin SSE-Only Part!
+// ------------------------------------------------------------------------

 #define DEFINE_LEGACY_MOV_OPCODE( mod, sse ) \
 	emitterT void sse##_MOV##mod##_M128_to_XMM( x86SSERegType to, uptr from )	{ xMOV##mod( xRegisterSSE(to), (void*)from ); } \
@ -290,73 +364,17 @@ emitterT void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 im
 emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)	{ xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
 emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)	{ xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }

+emitterT void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)		{ xDP.PS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
+emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)				{ xDP.PS( xRegisterSSE(to), (void*)from, imm8 ); }
+
+emitterT void SSE4_BLENDPS_XMM_to_XMM(x86IntRegType to, x86SSERegType from, u8 imm8)	{ xBLEND.PS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
+emitterT void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from)			{ xBLEND.VPS( xRegisterSSE(to), xRegisterSSE(from) ); }
+emitterT void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from)					{ xBLEND.VPS( xRegisterSSE(to), (void*)from ); }
+
+emitterT void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)			{ xPMOVSX.DQ( xRegisterSSE(to), xRegisterSSE(from) ); }

 emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }

-//////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////
-
-
-// SSE4.1
-
-emitterT void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) 
-{
-	write8(0x66);
-	write24(0x403A0F);
-	ModRM(3, to, from);
-	write8(imm8);
-}
-
-emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
-{
-	write8(0x66);
-	write24(0x403A0F);
-	ModRM(0, to, DISP32);
-	write32(MEMADDR(from, 4));
-	write8(imm8);
-}
-
-emitterT void SSE4_BLENDPS_XMM_to_XMM(x86IntRegType to, x86SSERegType from, u8 imm8)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x0C3A0F);
-	ModRM(3, to, from);
-	write8(imm8);
-}
-
-emitterT void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x14380F);
-	ModRM(3, to, from);
-}
-
-emitterT void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from)
-{
-	write8(0x66);
-    RexR(0, to);
-	write24(0x14380F);
-	ModRM(0, to, DISP32);
-	write32(MEMADDR(from, 4));
-}
-
-emitterT void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x25380F);
-	ModRM(3, to, from);
-}
-
-emitterT void SSE4_PMOVZXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x35380F);
-	ModRM(3, to, from);
-}

 //////////////////////////////////////////////////////////////////////////////////////////
 // SSE-X Helpers (generates either INT or FLOAT versions of certain SSE instructions)
--- a/pcsx2/x86/ix86/ix86_simd.cpp
+++ b/pcsx2/x86/ix86/ix86_simd.cpp
@ -0,0 +1,388 @@
+/*  Pcsx2 - Pc Ps2 Emulator
+ *  Copyright (C) 2002-2009  Pcsx2 Team
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *  
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#include "PrecompiledHeader.h"
+
+#include "System.h"
+#include "ix86_internal.h"
+
+namespace x86Emitter {
+
+using namespace Internal;
+
+// ------------------------------------------------------------------------
+// SimdPrefix - Writes the optional prefix byte, the 0x0f escape byte, and the opcode of a
+// SIMD instruction.
+//
+// If the lower byte of the opcode is 0x38 or 0x3a, then the opcode is treated as a 16 bit
+// value (in SSE 0x38 and 0x3a denote prefixes for extended SSE3/4 instructions).  For any
+// other lower byte the opcode is a single byte, and its upper byte is expected to be zero:
+// a non-zero upper byte in that case will generate an assertion.
+//
+// Parameters:
+//   prefix - prefix byte to emit ahead of 0x0f, or 0 for no prefix.
+//   opcode - one-byte opcode, or a two-byte opcode with 0x38/0x3a in its lower byte.
+//
+__emitinline void Internal::SimdPrefix( u8 prefix, u16 opcode )
+{
+	const bool is16BitOpcode = ((opcode & 0xff) == 0x38) || ((opcode & 0xff) == 0x3a);
+
+	// If the lower byte is not a valid prefix and the upper byte is non-zero it
+	// means we made a mistake!
+	if( !is16BitOpcode ) jASSUME( (opcode >> 8) == 0 );
+
+	if( prefix != 0 )
+	{
+		if( is16BitOpcode )
+		{
+			// Prefix, 0x0f and both opcode bytes are packed into one 32-bit write.  The
+			// opcode is cast before shifting: as a u16 it would be promoted to a signed
+			// int, and shifting a value with bit 15 set left by 16 overflows that int.
+			xWrite<u32>( ((u32)opcode<<16) | 0x0f00 | prefix );
+		}
+		else
+		{
+			xWrite<u16>( 0x0f00 | prefix );
+			xWrite<u8>( opcode );
+		}
+	}
+	else
+	{
+		if( is16BitOpcode )
+		{
+			xWrite<u8>( 0x0f );
+			xWrite<u16>( opcode );
+		}
+		else
+			xWrite<u16>( (opcode<<8) | 0x0f );
+	}
+}
+
+// [SSE-3]
+// (template arguments here look like <prefix, opcode>, mirroring SimdPrefix's parameters --
+// confirm against SimdImpl_DestRegSSE)
+const SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
+// [SSE-3]
+const SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
+
+// Aligned counterpart of xMOVUPS below; the two differ only in the bool template argument.
+const SimdImpl_MoveSSE<0x00,true> xMOVAPS;
+
+// Note: All implementations of Unaligned Movs will, when possible, use aligned movs instead.
+// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
+// which can be checked for alignment at runtime.
+const SimdImpl_MoveSSE<0x00,false> xMOVUPS;
+
+// With ALWAYS_USE_MOVAPS defined, the DQ and PD moves are instantiated with exactly the same
+// template arguments as xMOVAPS / xMOVUPS above (prefix 0); otherwise they get their own
+// prefixed instantiations.
+#ifdef ALWAYS_USE_MOVAPS
+const SimdImpl_MoveSSE<0,true> xMOVDQA;
+const SimdImpl_MoveSSE<0,true> xMOVAPD;
+
+// Note: All implementations of Unaligned Movs will, when possible, use aligned movs instead.
+// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
+// which can be checked for alignment at runtime.
+const SimdImpl_MoveSSE<0,false> xMOVDQU;
+const SimdImpl_MoveSSE<0,false> xMOVUPD;
+#else
+const SimdImpl_MoveDQ<0x66, 0x6f, 0x7f> xMOVDQA;
+const SimdImpl_MoveDQ<0xf3, 0x6f, 0x7f> xMOVDQU;
+const SimdImpl_MoveSSE<0x66,true> xMOVAPD;
+const SimdImpl_MoveSSE<0x66,false> xMOVUPD;
+#endif
+
+// High/low half moves.  The register-to-register-only forms (xMOVLH / xMOVHL) are
+// instantiated with the same opcodes (0x16 / 0x12) as xMOVH / xMOVL respectively.
+const MovhlImplAll<0x16> xMOVH;
+const MovhlImplAll<0x12> xMOVL;
+const MovhlImpl_RtoR<0x16> xMOVLH;
+const MovhlImpl_RtoR<0x12> xMOVHL;
+
+// Packed bitwise logic; all four share the 0x66 first template argument.
+const SimdImpl_DestRegEither<0x66,0xdb> xPAND;
+const SimdImpl_DestRegEither<0x66,0xdf> xPANDN;
+const SimdImpl_DestRegEither<0x66,0xeb> xPOR;
+const SimdImpl_DestRegEither<0x66,0xef> xPXOR;
+
+const SimdImpl_AndNot xANDN;
+
+// Floating point compare / reciprocal / square root helpers.  Note that xRCP and xRSQRT are
+// both instances of the SimdImpl_rSqrt template, differing only in opcode.
+const SimdImpl_UcomI<0x66,0x2e> xUCOMI;
+const SimdImpl_rSqrt<0x53> xRCP;
+const SimdImpl_rSqrt<0x52> xRSQRT;
+const SimdImpl_Sqrt<0x51> xSQRT;
+
+const SimdImpl_MinMax<0x5f> xMAX;
+const SimdImpl_MinMax<0x5d> xMIN;
+const SimdImpl_Shuffle<0xc6> xSHUF;
+
+// ------------------------------------------------------------------------
+
+// [SSE-4.1] Performs a bitwise AND of dest against src, and sets the ZF flag
+// only if all bits in the result are 0.  PTEST also sets the CF flag according
+// to the following condition: (xmm2/m128 AND NOT xmm1) == 0;
+//
+// This is the definition of the object (no 'extern'), like every other instruction
+// instance in this file.
+const Internal::SimdImpl_DestRegSSE<0x66,0x1738> xPTEST;
+
+// SSE compare instances: one object per comparison type, selected by the SSE2_* template
+// argument.
+const SimdImpl_Compare<SSE2_Equal>			xCMPEQ;
+const SimdImpl_Compare<SSE2_Less>			xCMPLT;
+const SimdImpl_Compare<SSE2_LessOrEqual>	xCMPLE;
+const SimdImpl_Compare<SSE2_Unordered>		xCMPUNORD;
+const SimdImpl_Compare<SSE2_NotEqual>		xCMPNE;
+const SimdImpl_Compare<SSE2_NotLess>		xCMPNLT;
+const SimdImpl_Compare<SSE2_NotLessOrEqual> xCMPNLE;
+const SimdImpl_Compare<SSE2_Ordered>		xCMPORD;
+
+// ------------------------------------------------------------------------
+// SSE Conversion Operations, as looney as they are.
+// 
+// These enforce pointer strictness for Indirect forms, due to the otherwise completely confusing
+// nature of the functions.  (so if a function expects an m32, you must use (u32*) or ptr32[]).
+//
+// Template arguments appear to be <prefix, opcode, dest register type, source register type,
+// memory operand type> -- confirm against SimdImpl_DestRegStrict.
+//
+const SimdImpl_DestRegStrict<0xf3,0xe6,xRegisterSSE,xRegisterSSE,u64>		xCVTDQ2PD;
+const SimdImpl_DestRegStrict<0x00,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTDQ2PS;
+
+const SimdImpl_DestRegStrict<0xf2,0xe6,xRegisterSSE,xRegisterSSE,u128>		xCVTPD2DQ;
+const SimdImpl_DestRegStrict<0x66,0x2d,xRegisterMMX,xRegisterSSE,u128>		xCVTPD2PI;
+const SimdImpl_DestRegStrict<0x66,0x5a,xRegisterSSE,xRegisterSSE,u128>		xCVTPD2PS;
+
+const SimdImpl_DestRegStrict<0x66,0x2a,xRegisterSSE,xRegisterMMX,u64>		xCVTPI2PD;
+const SimdImpl_DestRegStrict<0x00,0x2a,xRegisterSSE,xRegisterMMX,u64>		xCVTPI2PS;
+
+const SimdImpl_DestRegStrict<0x66,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTPS2DQ;
+const SimdImpl_DestRegStrict<0x00,0x5a,xRegisterSSE,xRegisterSSE,u64>		xCVTPS2PD;
+const SimdImpl_DestRegStrict<0x00,0x2d,xRegisterMMX,xRegisterSSE,u64>		xCVTPS2PI;
+
+const SimdImpl_DestRegStrict<0xf2,0x2d,xRegister32, xRegisterSSE,u64>		xCVTSD2SI;
+const SimdImpl_DestRegStrict<0xf2,0x5a,xRegisterSSE,xRegisterSSE,u64>		xCVTSD2SS;
+// NOTE(review): CVTSI2SD writes an XMM register per the Intel manuals, so the xRegisterMMX
+// destination type below looks like a typo for xRegisterSSE (compare xCVTSI2SS on the next
+// line).  Confirm, and change it together with the matching extern declaration.
+const SimdImpl_DestRegStrict<0xf2,0x2a,xRegisterMMX,xRegister32, u32>		xCVTSI2SD;
+const SimdImpl_DestRegStrict<0xf3,0x2a,xRegisterSSE,xRegister32, u32>		xCVTSI2SS;
+
+const SimdImpl_DestRegStrict<0xf3,0x5a,xRegisterSSE,xRegisterSSE,u32>		xCVTSS2SD;
+const SimdImpl_DestRegStrict<0xf3,0x2d,xRegister32, xRegisterSSE,u32>		xCVTSS2SI;
+
+const SimdImpl_DestRegStrict<0x66,0xe6,xRegisterSSE,xRegisterSSE,u128>		xCVTTPD2DQ;
+const SimdImpl_DestRegStrict<0x66,0x2c,xRegisterMMX,xRegisterSSE,u128>		xCVTTPD2PI;
+const SimdImpl_DestRegStrict<0xf3,0x5b,xRegisterSSE,xRegisterSSE,u128>		xCVTTPS2DQ;
+const SimdImpl_DestRegStrict<0x00,0x2c,xRegisterMMX,xRegisterSSE,u64>		xCVTTPS2PI;
+
+const SimdImpl_DestRegStrict<0xf2,0x2c,xRegister32, xRegisterSSE,u64>		xCVTTSD2SI;
+const SimdImpl_DestRegStrict<0xf3,0x2c,xRegister32, xRegisterSSE,u32>		xCVTTSS2SI;
+
+// ------------------------------------------------------------------------
+
+// Packed shifts.  xPSRA uses the ShiftWithoutQ variant (going by the type name, it lacks the
+// Q form that xPSRL / xPSLL provide).
+const SimdImpl_Shift<0xd0, 2> xPSRL;
+const SimdImpl_Shift<0xf0, 6> xPSLL;
+const SimdImpl_ShiftWithoutQ<0xe0, 4> xPSRA;
+
+// Packed integer add / subtract / min / max.
+const SimdImpl_AddSub<0xdc, 0xd4> xPADD;
+const SimdImpl_AddSub<0xd8, 0xfb> xPSUB;
+const SimdImpl_PMinMax<0xde,0x3c> xPMAX;
+const SimdImpl_PMinMax<0xda,0x38> xPMIN;
+
+// The remaining instances take no template arguments here; their encodings live entirely in
+// the corresponding SimdImpl_* classes.
+const SimdImpl_PMul xPMUL;
+const SimdImpl_PCompare xPCMP;
+const SimdImpl_PShuffle xPSHUF;
+const SimdImpl_PUnpack xPUNPCK;
+const SimdImpl_Unpack xUNPCK;
+const SimdImpl_Pack xPACK;
+
+const SimdImpl_PAbsolute xPABS;
+const SimdImpl_PSign xPSIGN;
+const SimdImpl_PInsert xPINSR;
+const SimdImpl_PExtract xPEXTR;
+const SimdImpl_PMultAdd xPMADD;
+const SimdImpl_HorizAdd xHADD;
+
+const SimdImpl_Blend xBLEND;
+const SimdImpl_DotProduct xDP;
+const SimdImpl_Round xROUND;
+
+// Packed moves with extension; the bool template argument distinguishes the SX instance
+// (true) from the ZX instance (false).
+const SimdImpl_PMove<true> xPMOVSX;
+const SimdImpl_PMove<false> xPMOVZX;
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+
+// Emits EMMS as a single 16-bit write of 0x770F (i.e. bytes 0F 77, assuming xWrite stores
+// the value in little-endian order -- confirm against xWrite).
+__emitinline void xEMMS()
+{
+	xWrite<u16>( 0x770F );
+}
+
+// Store Streaming SIMD Extension Control/Status to Mem32.
+// Shares opcode 0xae with xLDMXCSR; the two differ only in the first argument passed to
+// xWriteDisp (3 here, 2 for the load).
+__emitinline void xSTMXCSR( u32* dest )
+{
+	SimdPrefix( 0, 0xae );
+	xWriteDisp( 3, dest );
+}
+
+// Load Streaming SIMD Extension Control/Status from Mem32.
+// Shares opcode 0xae with xSTMXCSR; the two differ only in the first argument passed to
+// xWriteDisp (2 here, 3 for the store).
+__emitinline void xLDMXCSR( const u32* src )
+{
+	SimdPrefix( 0, 0xae );
+	xWriteDisp( 2, src );
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// MMX Mov Instructions (MOVD, MOVQ, MOVSS).
+//
+// Notes:
+//  * Some of the functions have been renamed to more clearly reflect what they actually
+//    do.  Namely we've affixed "ZX" to several MOVs that take a register as a destination
+//    since that's what they do (MOVD clears upper 32/96 bits, etc).
+//
+//  * MOVD has valid forms for MMX and XMM registers.
+//
+//  * In the store forms (xMOVD / xMOVQ to a 32-bit register or to memory) the SIMD source
+//    register is passed to xOpWrite0F first and the destination second.
+//
+
+__forceinline void xMOVDZX( const xRegisterSSE& to, const xRegister32& from )	{ xOpWrite0F( 0x66, 0x6e, to, from ); }
+__forceinline void xMOVDZX( const xRegisterSSE& to, const void* src )			{ xOpWrite0F( 0x66, 0x6e, to, src ); }
+__forceinline void xMOVDZX( const xRegisterSSE& to, const ModSibBase& src )		{ xOpWrite0F( 0x66, 0x6e, to, src ); }
+
+__forceinline void xMOVDZX( const xRegisterMMX& to, const xRegister32& from )	{ xOpWrite0F( 0x6e, to, from ); }
+__forceinline void xMOVDZX( const xRegisterMMX& to, const void* src )			{ xOpWrite0F( 0x6e, to, src ); }
+__forceinline void xMOVDZX( const xRegisterMMX& to, const ModSibBase& src )		{ xOpWrite0F( 0x6e, to, src ); }
+
+__forceinline void xMOVD( const xRegister32& to, const xRegisterSSE& from )		{ xOpWrite0F( 0x66, 0x7e, from, to ); }
+__forceinline void xMOVD( void* dest, const xRegisterSSE& from )				{ xOpWrite0F( 0x66, 0x7e, from, dest ); }
+__forceinline void xMOVD( const ModSibBase& dest, const xRegisterSSE& from )	{ xOpWrite0F( 0x66, 0x7e, from, dest ); }
+
+__forceinline void xMOVD( const xRegister32& to, const xRegisterMMX& from )		{ xOpWrite0F( 0x7e, from, to ); }
+__forceinline void xMOVD( void* dest, const xRegisterMMX& from )				{ xOpWrite0F( 0x7e, from, dest ); }
+__forceinline void xMOVD( const ModSibBase& dest, const xRegisterMMX& from )	{ xOpWrite0F( 0x7e, from, dest ); }
+
+
+// Moves from XMM to XMM, with the *upper 64 bits* of the destination register
+// being cleared to zero.
+__forceinline void xMOVQZX( const xRegisterSSE& to, const xRegisterSSE& from )	{ xOpWrite0F( 0xf3, 0x7e, to, from ); }
+
+// Moves from memory (indirect address form) to XMM, with the *upper 64 bits* of the
+// destination register being cleared to zero.
+__forceinline void xMOVQZX( const xRegisterSSE& to, const ModSibBase& src )		{ xOpWrite0F( 0xf3, 0x7e, to, src ); }
+
+// Moves from memory (direct pointer form) to XMM, with the *upper 64 bits* of the
+// destination register being cleared to zero.
+__forceinline void xMOVQZX( const xRegisterSSE& to, const void* src )			{ xOpWrite0F( 0xf3, 0x7e, to, src ); }
+
+// Moves lower quad of XMM to ptr64 (no bits are cleared)
+__forceinline void xMOVQ( const ModSibBase& dest, const xRegisterSSE& from )	{ xOpWrite0F( 0x66, 0xd6, from, dest ); }
+// Moves lower quad of XMM to ptr64 (no bits are cleared)
+__forceinline void xMOVQ( void* dest, const xRegisterSSE& from )				{ xOpWrite0F( 0x66, 0xd6, from, dest ); }
+
+// MMX forms.  The register-to-register move emits nothing at all when to == from.
+__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterMMX& from )	{ if( to != from ) xOpWrite0F( 0x6f, to, from ); }
+__forceinline void xMOVQ( const xRegisterMMX& to, const ModSibBase& src )		{ xOpWrite0F( 0x6f, to, src ); }
+__forceinline void xMOVQ( const xRegisterMMX& to, const void* src )				{ xOpWrite0F( 0x6f, to, src ); }
+__forceinline void xMOVQ( const ModSibBase& dest, const xRegisterMMX& from )	{ xOpWrite0F( 0x7f, from, dest ); }
+__forceinline void xMOVQ( void* dest, const xRegisterMMX& from )				{ xOpWrite0F( 0x7f, from, dest ); }
+
+// This form of xMOVQ is Intel's adeptly named 'MOVQ2DQ'
+__forceinline void xMOVQ( const xRegisterSSE& to, const xRegisterMMX& from )	{ xOpWrite0F( 0xf3, 0xd6, to, from ); }
+
+// This form of xMOVQ is Intel's adeptly named 'MOVDQ2Q'
+__forceinline void xMOVQ( const xRegisterMMX& to, const xRegisterSSE& from )
+{
+	// Manual implementation of this form of MOVQ, since its parameters are unique in a way
+	// that breaks the template inference of writeXMMop();
+	// NOTE(review): writeXMMop is not used anywhere else in this file; it looks like a stale
+	// name for xOpWrite0F -- confirm.
+
+	SimdPrefix( 0xf2, 0xd6 );
+	ModRM_Direct( to.Id, from.Id );
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+
+// Generates the xMOV<ssd> family for the given prefix byte:
+//  * xMOV<ssd>( reg, reg )      - opcode 0x10; emits nothing when to == from.
+//  * xMOV<ssd>ZX( reg, memory ) - opcode 0x10, direct pointer and ModSibBase forms.
+//  * xMOV<ssd>( memory, reg )   - opcode 0x11, direct pointer and ModSibBase forms.
+// NOTE(review): the direct-pointer store form takes its destination as `const void*`,
+// unlike the other store functions in this file, which take `void*`.
+#define IMPLEMENT_xMOVS( ssd, prefix ) \
+	__forceinline void xMOV##ssd( const xRegisterSSE& to, const xRegisterSSE& from )	{ if( to != from ) xOpWrite0F( prefix, 0x10, to, from ); } \
+	__forceinline void xMOV##ssd##ZX( const xRegisterSSE& to, const void* from )		{ xOpWrite0F( prefix, 0x10, to, from ); } \
+	__forceinline void xMOV##ssd##ZX( const xRegisterSSE& to, const ModSibBase& from )	{ xOpWrite0F( prefix, 0x10, to, from ); } \
+	__forceinline void xMOV##ssd( const void* to, const xRegisterSSE& from )			{ xOpWrite0F( prefix, 0x11, from, to ); } \
+	__forceinline void xMOV##ssd( const ModSibBase& to, const xRegisterSSE& from )		{ xOpWrite0F( prefix, 0x11, from, to ); }
+
+// Instantiate the scalar single (SS) and scalar double (SD) variants.
+IMPLEMENT_xMOVS( SS, 0xf3 )
+IMPLEMENT_xMOVS( SD, 0xf2 )
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Non-temporal movs.  The register-destination xMOVNTDQA forms are loads only (no stores);
+// every other function in this group takes a memory destination and a register source.
+//
+
+// The 4-byte sequence (prefix 0x66, 0x0f, 0x38, 0x2a) is written as one 32-bit value here
+// instead of going through xOpWrite0F.
+__forceinline void xMOVNTDQA( const xRegisterSSE& to, const void* from )
+{
+	xWrite<u32>( 0x2A380f66 );
+	xWriteDisp( to.Id, from );
+}
+
+__forceinline void xMOVNTDQA( const xRegisterSSE& to, const ModSibBase& from )
+{
+	xWrite<u32>( 0x2A380f66 );
+	EmitSibMagic( to.Id, from );
+}
+
+__forceinline void xMOVNTDQ( void* to, const xRegisterSSE& from )				{ xOpWrite0F( 0x66, 0xe7, from, to ); }
+// NOTE(review): this overload is named xMOVNTDQA but emits the same prefix/opcode
+// (0x66, 0xe7) as the xMOVNTDQ store above, so it looks like a misnamed xMOVNTDQ.
+// Confirm against the header declaration before renaming.
+__forceinline void xMOVNTDQA( const ModSibBase& to, const xRegisterSSE& from )	{ xOpWrite0F( 0x66, 0xe7, from, to ); }
+
+__forceinline void xMOVNTPD( void* to, const xRegisterSSE& from )				{ xOpWrite0F( 0x66, 0x2b, from, to ); }
+__forceinline void xMOVNTPD( const ModSibBase& to, const xRegisterSSE& from )	{ xOpWrite0F( 0x66, 0x2b, from, to ); }
+__forceinline void xMOVNTPS( void* to, const xRegisterSSE& from )				{ xOpWrite0F( 0x2b, from, to ); }
+__forceinline void xMOVNTPS( const ModSibBase& to, const xRegisterSSE& from )	{ xOpWrite0F( 0x2b, from, to ); }
+
+__forceinline void xMOVNTQ( void* to, const xRegisterMMX& from )				{ xOpWrite0F( 0xe7, from, to ); }
+__forceinline void xMOVNTQ( const ModSibBase& to, const xRegisterMMX& from )	{ xOpWrite0F( 0xe7, from, to ); }
+
+// ------------------------------------------------------------------------
+
+// xMOVMSKPS / xMOVMSKPD:
+// Extracts the sign bits of the packed single / double precision values in 'from' into the
+// low bits of the 32-bit register 'to'.  Neither instruction takes an immediate operand, so
+// both use the xOpWrite0F forms without a trailing imm8 argument.
+__forceinline void xMOVMSKPS( const xRegister32& to, const xRegisterSSE& from)	{ xOpWrite0F( 0x50, to, from ); }
+__forceinline void xMOVMSKPD( const xRegister32& to, const xRegisterSSE& from)	{ xOpWrite0F( 0x66, 0x50, to, from ); }
+
+// xMASKMOV:
+// Selectively write bytes from mm1/xmm1 to memory location using the byte mask in mm2/xmm2.
+// The default memory location is specified by DS:EDI.  The most significant bit in each byte
+// of the mask operand determines whether the corresponding byte in the source operand is
+// written to the corresponding byte location in memory.
+//
+// The SSE and MMX forms share opcode 0xf7; the SSE form adds the 0x66 prefix.
+__forceinline void xMASKMOV( const xRegisterSSE& to, const xRegisterSSE& from )		{ xOpWrite0F( 0x66, 0xf7, to, from ); }
+__forceinline void xMASKMOV( const xRegisterMMX& to, const xRegisterMMX& from )		{ xOpWrite0F( 0xf7, to, from ); }
+
+// xPMOVMSKB:
+// Creates a mask made up of the most significant bit of each byte of the source 
+// operand and stores the result in the low byte or word of the destination operand.
+// Upper bits of the destination are cleared to zero.
+//
+// When operating on a 64-bit (MMX) source, the byte mask is 8 bits; when operating on
+// 128-bit (SSE) source, the byte mask is 16-bits.
+//
+__forceinline void xPMOVMSKB( const xRegister32& to, const xRegisterSSE& from )		{ xOpWrite0F( 0x66, 0xd7, to, from ); }
+__forceinline void xPMOVMSKB( const xRegister32& to, const xRegisterMMX& from )		{ xOpWrite0F( 0xd7, to, from ); }
+
+// [sSSE-3] Concatenates dest and source operands into an intermediate composite,
+// shifts the composite at byte granularity to the right by a constant immediate,
+// and extracts the right-aligned result into the destination.
+//
+// The opcode is passed as the 16-bit value 0x0f3a: its lower byte (0x3a) is one of the two
+// values SimdPrefix treats as marking a two-byte opcode.
+//
+__forceinline void xPALIGNR( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )	{ xOpWrite0F( 0x66, 0x0f3a, to, from, imm8 ); }
+__forceinline void xPALIGNR( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm8 )	{ xOpWrite0F( 0x0f3a, to, from, imm8 ); }
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// INSERTPS / EXTRACTPS   [SSE4.1 only!]
+//
+// [TODO] these might be served better as classes, especially if other instructions use
+// the M32,sse,imm form (I forget offhand if any do).
+
+
+// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
+// location in dest, and selectively zero out the data elements in dest according to
+// the mask field in the immediate byte. The source operand can be a memory location
+// (32 bits) or an XMM register (lower 32 bits used).
+//
+// Imm8 provides three fields:
+//  * COUNT_S: The value of Imm8[7:6] selects the dword element from src.  It is 0 if
+//    the source is a memory operand.
+//  * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
+//  * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
+//    with 0.0 if set to 1.
+//
+// The memory forms only accept 32-bit typed operands (u32* or ModSibStrict<u32>).
+//
+__emitinline void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )		{ xOpWrite0F( 0x66, 0x213a, to, from, imm8 ); }
+__emitinline void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 )					{ xOpWrite0F( 0x66, 0x213a, to, from, imm8 ); }
+__emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 )	{ xOpWrite0F( 0x66, 0x213a, to, from, imm8 ); }
+
+// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
+// determined by imm8[1-0]*32. The extracted single precision floating-point value
+// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
+//
+// In every form the XMM source is passed to xOpWrite0F first and the destination second,
+// the same ordering used by the memory-destination overloads below and by the
+// xMOVD( xRegister32, xRegisterSSE ) store form.
+//
+__emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )		{ xOpWrite0F( 0x66, 0x173a, from, to, imm8 ); }
+__emitinline void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 )					{ xOpWrite0F( 0x66, 0x173a, from, dest, imm8 ); }
+__emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 ){ xOpWrite0F( 0x66, 0x173a, from, dest, imm8 ); }
+
+}
--- a/pcsx2/x86/ix86/ix86_sse_helpers.h
+++ b/pcsx2/x86/ix86/ix86_sse_helpers.h
@ -48,4 +48,3 @@ extern void SSEX_PUNPCKLDQ_M128_to_XMM(x86SSERegType to, uptr from );
 extern void SSEX_PUNPCKLDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from );
 extern void SSEX_PUNPCKHDQ_M128_to_XMM(x86SSERegType to, uptr from );
 extern void SSEX_PUNPCKHDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from );
-extern void SSEX_MOVHLPS_XMM_to_XMM( x86SSERegType to, x86SSERegType from );
--- a/pcsx2/x86/ix86/ix86_types.h
+++ b/pcsx2/x86/ix86/ix86_types.h
@ -252,28 +252,6 @@ namespace x86Emitter
 		}
 	};

-	//////////////////////////////////////////////////////////////////////////////////////////
-	//
-	template< typename OperandType >
-	class xRegisterSIMD : public xRegister<OperandType>
-	{
-	public:
-		static const xRegisterSIMD Empty;		// defined as an empty/unused value (-1)
-
-	public:
-		xRegisterSIMD(): xRegister<OperandType>() {}
-		xRegisterSIMD( const xRegisterSIMD& src ) : xRegister<OperandType>( src.Id ) {}
-		xRegisterSIMD( const xRegister<OperandType>& src ) : xRegister<OperandType>( src ) {}
-		explicit xRegisterSIMD( int regId ) : xRegister<OperandType>( regId ) {}
-
-		xRegisterSIMD<OperandType>& operator=( const xRegisterSIMD<OperandType>& src )
-		{
-			xRegister<OperandType>::Id = src.Id;
-			return *this;
-		}
-	};
-
-	
 	// ------------------------------------------------------------------------
 	// Note: GCC parses templates ahead of time apparently as a 'favor' to the programmer, which
 	// means it finds undeclared variables when MSVC does not (Since MSVC compiles templates
@ -282,8 +260,8 @@ namespace x86Emitter
 	// all about the the templated code in haphazard fashion.  Yay.. >_<
 	//

-	typedef xRegisterSIMD<u128> xRegisterSSE;
-	typedef xRegisterSIMD<u64>  xRegisterMMX;
+	typedef xRegister<u128> xRegisterSSE;
+	typedef xRegister<u64>  xRegisterMMX;
 	typedef xRegister<u32>  xRegister32;
 	typedef xRegister<u16>  xRegister16;
 	typedef xRegister<u8>   xRegister8;