Emitter Rewrite, Part 3 of 5: Finished all SIMD instructions, except those embedded into base instruction groups (CMPSS/SD, DIVSS/SD, etc).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2135 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-11-05 23:39:45 +00:00 · 2009-11-05 23:39:45 +00:00 · fcdb429bb9
parent 04c86ea6d3
commit fcdb429bb9
10 changed files with 570 additions and 534 deletions
--- a/common/build/x86emitter/x86emitter.vcproj
+++ b/common/build/x86emitter/x86emitter.vcproj
@ -332,14 +332,6 @@
 			<Filter
 				Name="Implement_Simd"
 				>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\xmm\moremovs.h"
 					>
 				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\xmm\shufflepack.h"
 					>
 				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\simd_arithmetic.h"
 					>
@ -352,6 +344,14 @@
 					RelativePath="..\..\include\x86emitter\implement\simd_helpers.h"
 					>
 				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\simd_moremovs.h"
 					>
 				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\simd_shufflepack.h"
 					>
 				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\simd_templated_helpers.h"
 					>
--- a/common/include/x86emitter/implement/helpers.h
+++ b/common/include/x86emitter/implement/helpers.h
@ -23,7 +23,6 @@
 #pragma once
 // Shorthand wrappers around xOpWrite0F.  Both macros expand to a call that names `to`
 // and `from` literally, so they can only be used where identifiers with exactly those
 // names are in scope at the expansion site.
 #define OpWriteSSE( pre, op )		xOpWrite0F( pre, op, to, from )
 #define OpWriteMMX( op )			xOpWrite0F( op, to, from )
 // Declared here, defined elsewhere.
 // NOTE(review): presumably writes the optional SIMD prefix byte followed by the opcode
 // bytes -- confirm against the definition.
 extern void SimdPrefix( u8 prefix, u16 opcode );
 // Declared here, defined elsewhere.
 // NOTE(review): presumably emits the ModRM/SIB encoding for `regfield` paired with an
 // absolute address -- confirm against the definition.
 extern void EmitSibMagic( uint regfield, const void* address );
--- a/common/include/x86emitter/implement/movs.h
+++ b/common/include/x86emitter/implement/movs.h
@ -18,10 +18,11 @@
 // Header: ix86_impl_movs.h -- covers mov, cmov, movsx/movzx, and SETcc (which shares
 // with cmov many similarities).
 // Note: This header is meant to be included from within the x86Emitter::Internal namespace.
-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
-// MOV instruction Implementation
+//  MovImplAll
 // --------------------------------------------------------------------------------------
 // MOV instruction Implementation, plus many SIMD sub-mov variants.
 class MovImplAll
 {
--- a/common/include/x86emitter/implement/simd_moremovs.h
+++ b/common/include/x86emitter/implement/simd_moremovs.h
@ -0,0 +1,174 @@
 /*  PCSX2 - PS2 Emulator for PCs
 *  Copyright (C) 2002-2009  PCSX2 Dev Team
 * 
 *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
 *  of the GNU Lesser General Public License as published by the Free Software Found-
 *  ation, either version 3 of the License, or (at your option) any later version.
 *
 *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *  PURPOSE.  See the GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along with PCSX2.
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 #pragma once
 namespace x86Emitter {
 // --------------------------------------------------------------------------------------
 //  xImplSimd_MovHL
 // --------------------------------------------------------------------------------------
 // Moves to/from high/low portions of an xmm register.
 // These instructions cannot be used in reg/reg form.
 //
 struct xImplSimd_MovHL
 {
 	// Opcode for the instruction; the struct has no constructor, so this is set by
 	// whoever initializes the instance.
 	// NOTE(review): the templated MovhlImplAll this replaces emitted Opcode for the
 	// load form and Opcode+1 for the store form -- confirm the new .cpp does the same.
 	u16	Opcode;
 	// Single-precision forms: register <- memory, and memory <- register.
 	void PS( const xRegisterSSE& to, const ModSibBase& from ) const;
 	void PS( const ModSibBase& to, const xRegisterSSE& from ) const;
 	// Double-precision forms: register <- memory, and memory <- register.
 	void PD( const xRegisterSSE& to, const ModSibBase& from ) const;
 	void PD( const ModSibBase& to, const xRegisterSSE& from ) const;
 };
 // --------------------------------------------------------------------------------------
 //  xImplSimd_MovHL_RtoR
 // --------------------------------------------------------------------------------------
 // RegtoReg forms of MOVHL/MOVLH -- these are the same opcodes as MOVH/MOVL but
 // do something kinda different! Fun!
 //
 struct xImplSimd_MovHL_RtoR
 {
 	// Opcode for the instruction; set by whoever initializes the instance.
 	u16	Opcode;
 	// Register-to-register only; no memory operand overloads are declared.
 	// NOTE(review): the templated MovhlImpl_RtoR this replaces wrote PS with no prefix
 	// and PD with a 0x66 prefix -- confirm the new .cpp does the same.
 	void PS( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void PD( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 };
 // --------------------------------------------------------------------------------------
 //  xImplSimd_MoveSSE
 // --------------------------------------------------------------------------------------
 // Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
 //
 // All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 // This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
 // which can be checked for alignment at runtime.
 //
 struct xImplSimd_MoveSSE
 {
 	// Prefix byte for the instruction; set by whoever initializes the instance.
 	u8		Prefix;
 	// True when this instance represents the aligned mnemonic.
 	bool	isAligned;
 	// Register <- register.
 	void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	// Register <- memory.
 	void operator()( const xRegisterSSE& to, const ModSibBase& from ) const;
 	// Memory <- register.
 	void operator()( const ModSibBase& to, const xRegisterSSE& from ) const;
 };
 // --------------------------------------------------------------------------------------
 //  xImplSimd_MoveDQ
 // --------------------------------------------------------------------------------------
 // Implementations for MOVDQA / MOVDQU
 //
 // All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 // This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
 // which can be checked for alignment at runtime.
 struct xImplSimd_MoveDQ
 {
 	// Prefix byte for the instruction; set by whoever initializes the instance.
 	u8		Prefix;
 	// True when this instance represents the aligned mnemonic (MOVDQA).
 	bool	isAligned;
 	// Register <- register.
 	void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	// Register <- memory.
 	void operator()( const xRegisterSSE& to, const ModSibBase& from ) const;
 	// Memory <- register.
 	void operator()( const ModSibBase& to, const xRegisterSSE& from ) const;
 };
 // --------------------------------------------------------------------------------------
 //  xImplSimd_Blend
 // --------------------------------------------------------------------------------------
 // Blend - Conditional copying of values in src into dest.
 //
 struct xImplSimd_Blend
 {
 	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
 	// mask bits in the immediate operand (bits [3:0]).  Each mask bit corresponds to a
 	// dword element in a 128-bit operand.
 	//
 	// If a mask bit is 1, then the corresponding dword in the source operand is copied
 	// to dest, else the dword element in dest is left unchanged.
 	//
 	xImplSimd_DestRegImmSSE	PS;
 	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
 	// mask bits in the immediate operand (bits [1:0]).  Each mask bit corresponds to a
 	// quadword element in a 128-bit operand.
 	//
 	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
 	// to dest, else the quadword element in dest is left unchanged.
 	//
 	xImplSimd_DestRegImmSSE	PD;
 	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
 	// mask held in XMM0 (yes, the fixed register).  Each dword element of the 128-bit
 	// operand is controlled by the most significant bit of the matching dword in XMM0.
 	//
 	// If a mask bit is 1, then the corresponding dword in the source operand is copied
 	// to dest, else the dword element in dest is left unchanged.
 	//
 	xImplSimd_DestRegSSE	VPS;
 	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
 	// mask held in XMM0 (yes, the fixed register).  Each quadword element of the 128-bit
 	// operand is controlled by the most significant bit of the matching quadword in XMM0.
 	//
 	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
 	// to dest, else the quadword element in dest is left unchanged.
 	//
 	xImplSimd_DestRegSSE	VPD;
 };
 // --------------------------------------------------------------------------------------
 //  xImplSimd_PMove
 // --------------------------------------------------------------------------------------
 // Packed Move with Sign or Zero extension.
 //
 struct xImplSimd_PMove
 {
 	// Opcode of the first (BW) form; set by whoever initializes the instance.
 	// NOTE(review): the templated SimdImpl_PMove this replaces used 0x2038 for the
 	// sign-extending variant and 0x3038 for the zero-extending one, stepping by 0x100
 	// per form -- confirm the new .cpp keeps that layout.
 	u16		OpcodeBase;
 	// Each form below has a register source overload and a memory source overload.  The
 	// memory overloads take ModSibStrict<T>, so the caller must state the operand size.
 	//
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into word integers
 	// and store them in dest.
 	void BW( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void BW( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const;
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into dword integers
 	// and store them in dest.
 	void BD( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void BD( const xRegisterSSE& to, const ModSibStrict<u32>& from ) const;
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into qword integers
 	// and store them in dest.
 	void BQ( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void BQ( const xRegisterSSE& to, const ModSibStrict<u16>& from ) const;
 	// [SSE-4.1] Zero/Sign-extend the low word values in src into dword integers
 	// and store them in dest.
 	void WD( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void WD( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const;
 	// [SSE-4.1] Zero/Sign-extend the low word values in src into qword integers
 	// and store them in dest.
 	void WQ( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void WQ( const xRegisterSSE& to, const ModSibStrict<u32>& from ) const;
 	// [SSE-4.1] Zero/Sign-extend the low dword values in src into qword integers
 	// and store them in dest.
 	void DQ( const xRegisterSSE& to, const xRegisterSSE& from ) const;
 	void DQ( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const;
 };
 }
--- a/common/include/x86emitter/implement/simd_shufflepack.h
+++ b/common/include/x86emitter/implement/simd_shufflepack.h
@ -15,50 +15,44 @@
 #pragma once
-//////////////////////////////////////////////////////////////////////////////////////////
+namespace x86Emitter {
-//
+
-template< u16 OpcodeSSE >
+// --------------------------------------------------------------------------------------
-class SimdImpl_Shuffle
+//  xImplSimd_Shuffle
 // --------------------------------------------------------------------------------------
 struct xImplSimd_Shuffle
 {
-protected:
+	inline void _selector_assertion_check( u8 selector ) const;
-	template< u8 Prefix > struct Woot
+
-	{
+	void PS( const xRegisterSSE& to, const xRegisterSSE& from, u8 selector ) const;
-		__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 cmptype ) const	{ xOpWrite0F( Prefix, OpcodeSSE, to, from ); xWrite8( cmptype ); }
+	void PS( const xRegisterSSE& to, const ModSibBase& from, u8 selector ) const;
-		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 cmptype ) const	{ xOpWrite0F( Prefix, OpcodeSSE, to, from ); xWrite8( cmptype ); }
+
-		Woot() {}
+	void PD( const xRegisterSSE& to, const xRegisterSSE& from, u8 selector ) const;
 	void PD( const xRegisterSSE& to, const ModSibBase& from, u8 selector ) const;
 };
-public:
+// --------------------------------------------------------------------------------------
-	const Woot<0x00> PS;
+//  xImplSimd_PShuffle
-	const Woot<0x66> PD;
+// --------------------------------------------------------------------------------------
-
+struct xImplSimd_PShuffle
 	SimdImpl_Shuffle() {} //GCWhat?
 };
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 class SimdImpl_PShuffle
 {
 public:
 	SimdImpl_PShuffle() {}
 	// Copies words from src and inserts them into dest at word locations selected with
 	// the order operand (8 bit immediate).
-	const SimdImpl_DestRegImmMMX<0x00,0x70> W;
+	const xImplSimd_DestRegImmMMX	W;
 	// Copies doublewords from src and inserts them into dest at dword locations selected
 	// with the order operand (8 bit immediate).
-	const SimdImpl_DestRegImmSSE<0x66,0x70> D;
+	const xImplSimd_DestRegImmSSE	D;
 	// Copies words from the low quadword of src and inserts them into the low quadword
 	// of dest at word locations selected with the order operand (8 bit immediate).
 	// The high quadword of src is copied to the high quadword of dest.
-	const SimdImpl_DestRegImmSSE<0xf2,0x70> LW;
+	const xImplSimd_DestRegImmSSE	LW;
 	// Copies words from the high quadword of src and inserts them into the high quadword
 	// of dest at word locations selected with the order operand (8 bit immediate).
 	// The low quadword of src is copied to the low quadword of dest.
-	const SimdImpl_DestRegImmSSE<0xf3,0x70> HW;
+	const xImplSimd_DestRegImmSSE	HW;
 	// [sSSE-3] Performs in-place shuffles of bytes in dest according to the shuffle
 	// control mask in src.  If the most significant bit (bit[7]) of each byte of the
@ -68,68 +62,62 @@ public:
 	// operation) or 3 bits (64-bit operation) of the shuffle control byte.
 	//
 	// Operands can be MMX or XMM registers.
-	const SimdImpl_DestRegEither<0x66,0x0038> B;
+	const xImplSimd_DestRegEither	B;
 };
-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
-//
+//  SimdImpl_PUnpack
-class SimdImpl_PUnpack
+// --------------------------------------------------------------------------------------
 struct SimdImpl_PUnpack
 {
 public:
 	SimdImpl_PUnpack() {}
 	// Unpack and interleave low-order bytes from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x60> LBW;
+	const xImplSimd_DestRegEither	LBW;
 	// Unpack and interleave low-order words from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x61> LWD;
+	const xImplSimd_DestRegEither	LWD;
 	// Unpack and interleave low-order doublewords from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x62> LDQ;
+	const xImplSimd_DestRegEither	LDQ;
 	// Unpack and interleave low-order quadwords from src and dest into dest.
-	const SimdImpl_DestRegSSE<0x66,0x6c> LQDQ;
+	const xImplSimd_DestRegSSE		LQDQ;
 	// Unpack and interleave high-order bytes from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x68> HBW;
+	const xImplSimd_DestRegEither	HBW;
 	// Unpack and interleave high-order words from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x69> HWD;
+	const xImplSimd_DestRegEither	HWD;
 	// Unpack and interleave high-order doublewords from src and dest into dest.
-	const SimdImpl_DestRegEither<0x66,0x6a> HDQ;
+	const xImplSimd_DestRegEither	HDQ;
 	// Unpack and interleave high-order quadwords from src and dest into dest.
-	const SimdImpl_DestRegSSE<0x66,0x6d> HQDQ;
+	const xImplSimd_DestRegSSE		HQDQ;
 };
-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
 //  SimdImpl_Pack
 // --------------------------------------------------------------------------------------
 // Pack with Signed or Unsigned Saturation
 //
-class SimdImpl_Pack
+struct SimdImpl_Pack
 {
 public:
 	SimdImpl_Pack() {}
 	// Converts packed signed word integers from src and dest into packed signed 
 	// byte integers in dest, using signed saturation.
-	const SimdImpl_DestRegEither<0x66,0x63> SSWB;
+	const xImplSimd_DestRegEither	SSWB;
 	// Converts packed signed dword integers from src and dest into packed signed 
 	// word integers in dest, using signed saturation.
-	const SimdImpl_DestRegEither<0x66,0x6b> SSDW;
+	const xImplSimd_DestRegEither	SSDW;
 	// Converts packed signed word integers from src and dest into packed unsigned
 	// byte integers in dest, using unsigned saturation.
-	const SimdImpl_DestRegEither<0x66,0x67> USWB;
+	const xImplSimd_DestRegEither	USWB;
 	// [SSE-4.1] Converts packed signed dword integers from src and dest into packed
 	// unsigned word integers in dest, using unsigned saturation.
-	const SimdImpl_DestRegSSE<0x66,0x2b38> USDW;
+	const xImplSimd_DestRegSSE		USDW;
 };
-
+// --------------------------------------------------------------------------------------
-//////////////////////////////////////////////////////////////////////////////////////////
+//  SimdImpl_Unpack
-//
+// --------------------------------------------------------------------------------------
-class SimdImpl_Unpack
+struct xImplSimd_Unpack
 {
 public:
 	SimdImpl_Unpack() {}
 	// Unpacks the high doubleword [single-precision] values from src and dest into
 	// dest, such that the result of dest looks like this:
 	//    dest[0] <- dest[2]
@ -137,14 +125,14 @@ public:
 	//    dest[2] <- dest[3]
 	//    dest[3] <- src[3]
 	//
-	const SimdImpl_DestRegSSE<0x00,0x15> HPS;
+	const xImplSimd_DestRegSSE		HPS;
 	// Unpacks the high quadword [double-precision] values from src and dest into
 	// dest, such that the result of dest looks like this:
 	//    dest.lo <- dest.hi
 	//    dest.hi <- src.hi
 	//
-	const SimdImpl_DestRegSSE<0x66,0x15> HPD;
+	const xImplSimd_DestRegSSE		HPD;
 	// Unpacks the low doubleword [single-precision] values from src and dest into
 	// dest, such that the result of dest looks like this:
@ -153,7 +141,7 @@ public:
 	//    dest[1] <- src[0]
 	//    dest[0] <- dest[0]
 	//
-	const SimdImpl_DestRegSSE<0x00,0x14> LPS;
+	const xImplSimd_DestRegSSE		LPS;
 	// Unpacks the low quadword [double-precision] values from src and dest into
 	// dest, effectively moving the low portion of src into the upper portion of dest.
@ -161,47 +149,39 @@ public:
 	//    dest.hi <- src.lo
 	//    dest.lo <- dest.lo  [remains unchanged!]
 	//
-	const SimdImpl_DestRegSSE<0x66,0x14> LPD;
+	const xImplSimd_DestRegSSE		LPD;
 };
-//////////////////////////////////////////////////////////////////////////////////////////
+
 // Shared helper for the byte/dword forms of the packed insert/extract instructions.
 struct xImplSimd_InsertExtractHelper
 {
 	// Opcode for the instruction; set by whoever initializes the instance.
 	u16		Opcode;
 	// [SSE-4.1] Allowed with SSE registers only (MMX regs are invalid)
 	// Form taking a 32-bit general purpose register operand plus an 8-bit immediate.
 	void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const;
 	// [SSE-4.1] Allowed with SSE registers only (MMX regs are invalid)
 	// Form taking an indirect memory operand plus an 8-bit immediate.
 	void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const;
 };
 // --------------------------------------------------------------------------------------
 //  SimdImpl_PInsert
 // --------------------------------------------------------------------------------------
 // PINSRW/B/D [all but Word form are SSE4.1 only!]
 //
-class SimdImpl_PInsert
+struct xImplSimd_PInsert
 {
-protected:
+	void W( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const;
-	template< u16 Opcode >
+	void W( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const;
 	class ByteDwordForms
 	{
 	public:
 		ByteDwordForms() {}
-		__forceinline void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
+	void W( const xRegisterMMX& to, const xRegister32& from, u8 imm8 ) const;
-		{
+	void W( const xRegisterMMX& to, const ModSibBase& from, u8 imm8 ) const;
 			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}
-		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
+	// [SSE-4.1] Allowed with SSE registers only (MMX regs are invalid)
-		{
+	xImplSimd_InsertExtractHelper	B;
 			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}
 	};
-public:
+	// [SSE-4.1] Allowed with SSE registers only (MMX regs are invalid)
-	SimdImpl_PInsert() {}
+	xImplSimd_InsertExtractHelper	D;
 	// Operation can be performed on either MMX or SSE src operands.
 	__forceinline void W( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const	{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
 	__forceinline void W( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const	{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
 	__forceinline void W( const xRegisterMMX& to, const xRegister32& from, u8 imm8 ) const	{ xOpWrite0F( 0xc4, to, from, imm8 ); }
 	__forceinline void W( const xRegisterMMX& to, const ModSibBase& from, u8 imm8 ) const	{ xOpWrite0F( 0xc4, to, from, imm8 ); }
 	// [SSE-4.1] 
 	const ByteDwordForms<0x20> B;
 	// [SSE-4.1]
 	const ByteDwordForms<0x22> D;
 };
@ -210,47 +190,26 @@ public:
 //
 // Note: Word form's indirect memory form is only available in SSE4.1.
 //
-class SimdImpl_PExtract
+struct SimdImpl_PExtract
 {
 protected:
 	template< u16 Opcode >
 	class ByteDwordForms
 	{
 	public:
 		ByteDwordForms() {}
 		__forceinline void operator()( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const
 		{
 			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, to, from, imm8 );
 		}
 		__forceinline void operator()( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
 		{
 			xOpWrite0F( 0x66, (Opcode<<8) | 0x3a, from, dest, imm8 );
 		}
 	};
 public:
 	SimdImpl_PExtract() {}
 	// Copies the word element specified by imm8 from src to dest.  The upper bits
 	// of dest are zero-extended (cleared).  This can be used to extract any single packed
 	// word value from src into an x86 32 bit register.
 	//
 	// [SSE-4.1] Note: Indirect memory forms of this instruction are an SSE-4.1 extension!
 	//
-	__forceinline void W( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0xc5, to, from, imm8 ); }
+	void W( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const;
-	__forceinline void W( const xRegister32& to, const xRegisterMMX& from, u8 imm8 ) const		{ xOpWrite0F( 0xc5, to, from, imm8 ); }
+	void W( const xRegister32& to, const xRegisterMMX& from, u8 imm8 ) const;
-
+	void W( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const;
 	__forceinline void W( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0x153a, from, dest, imm8 ); }
 	// [SSE-4.1] Copies the byte element specified by imm8 from src to dest.  The upper bits
 	// of dest are zero-extended (cleared).  This can be used to extract any single packed
 	// byte value from src into an x86 32 bit register.
-	const ByteDwordForms<0x14> B;
+	const xImplSimd_InsertExtractHelper	B;
 	// [SSE-4.1] Copies the dword element specified by imm8 from src to dest.  This can be
 	// used to extract any single packed dword value from src into an x86 32 bit register.
-	const ByteDwordForms<0x16> D;
+	const xImplSimd_InsertExtractHelper	D;
 };
 }
--- a/common/include/x86emitter/implement/simd_templated_helpers.h
+++ b/common/include/x86emitter/implement/simd_templated_helpers.h
@ -36,64 +36,3 @@ public:
 	SimdImpl_DestRegSSE() {} //GCWho?
 };
 // ------------------------------------------------------------------------
 // For implementing SSE-only logic operations that have xmmreg,reg/rm,imm forms only
 // (PSHUFD / PSHUFHW / etc).
 //
 template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegImmSSE
 {
 public:
 	// Both forms pass Prefix, Opcode, the operands and the immediate straight through
 	// to xOpWrite0F; only the source operand type differs (register vs. memory).
 	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
 	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
 	// NOTE(review): the explicit empty constructor is presumably there so that const
 	// instances can be default-initialized under GCC -- confirm before removing.
 	SimdImpl_DestRegImmSSE() {} //GCWho?
 };
 // MMX counterpart of the reg,reg/rm,imm helper.
 // Note: the Prefix template parameter is accepted but never used -- both overloads call
 // the prefix-less xOpWrite0F form.
 template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegImmMMX
 {
 public:
 	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }
 	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }
 	// NOTE(review): the explicit empty constructor is presumably there so that const
 	// instances can be default-initialized under GCC -- confirm before removing.
 	SimdImpl_DestRegImmMMX() {} //GCWho?
 };
 // ------------------------------------------------------------------------
 // For implementing MMX/SSE operations that have reg,reg/rm forms only,
 // but accept either MM or XMM destinations (most PADD/PSUB and other P arithmetic ops).
 //
 template< u8 Prefix, u16 Opcode >
 class SimdImpl_DestRegEither
 {
 public:
 	// XMM forms: written with Prefix.
 	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
 	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
 	// MMX forms: same Opcode, but Prefix is not passed.
 	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const	{ xOpWrite0F( Opcode, to, from ); }
 	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from ) const	{ xOpWrite0F( Opcode, to, from ); }
 	// NOTE(review): the explicit empty constructor is presumably there so that const
 	// instances can be default-initialized under GCC -- confirm before removing.
 	SimdImpl_DestRegEither() {} //GCWho?
 };
 // ------------------------------------------------------------------------
 // For implementing MMX/SSE operations where the destination *must* be a register, but the
 // source can be Direct or Indirect (ModRM/SibSB).  The SrcOperandType template parameter
 // is used to enforce type strictness of the (void*) parameter and ModSib<> parameter, so
 // that the programmer must be explicit in specifying desired operand size.
 //
 // IMPORTANT: This helper assumes the prefix opcode is written *always* -- regardless of
 // MMX or XMM register status.
 //
 template< u8 Prefix, u16 Opcode, typename DestRegType, typename SrcRegType, typename SrcOperandType >
 class SimdImpl_DestRegStrict
 {
 public:
 	// Register source form.
 	__forceinline void operator()( const DestRegType& to, const SrcRegType& from ) const					{ xOpWrite0F( Prefix, Opcode, to, from ); }
 	// Memory source form; ModSibStrict<SrcOperandType> forces the caller to state the
 	// operand size.
 	__forceinline void operator()( const DestRegType& to, const ModSibStrict<SrcOperandType>& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
 	// NOTE(review): the explicit empty constructor is presumably there so that const
 	// instances can be default-initialized under GCC -- confirm before removing.
 	SimdImpl_DestRegStrict() {} //GCWho?
 };
--- a/common/include/x86emitter/implement/xmm/moremovs.h
+++ b/common/include/x86emitter/implement/xmm/moremovs.h
@ -1,211 +0,0 @@
 /*  PCSX2 - PS2 Emulator for PCs
 *  Copyright (C) 2002-2009  PCSX2 Dev Team
 * 
 *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
 *  of the GNU Lesser General Public License as published by the Free Software Found-
 *  ation, either version 3 of the License, or (at your option) any later version.
 *
 *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *  PURPOSE.  See the GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along with PCSX2.
 *  If not, see <http://www.gnu.org/licenses/>.
 */
 #pragma once
 //////////////////////////////////////////////////////////////////////////////////////////
 // Moves to/from high/low portions of an xmm register.
 // These instructions cannot be used in reg/reg form.
 //
 template< u16 Opcode >
 class MovhlImplAll
 {
 protected:
 	// Functor implementing one precision variant; Prefix selects the variant.
 	template< u8 Prefix >
 	struct Woot
 	{
 		Woot() {}
 		// Load form (register <- memory): written with Opcode.
 		__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( Prefix, Opcode, to, from ); }
 		// Store form (memory <- register): written with Opcode+1, and the register is
 		// passed to xOpWrite0F ahead of the memory operand.
 		__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const	{ xOpWrite0F( Prefix, Opcode+1, from, to ); }
 	};
 public:
 	// Single-precision variant (prefix value 0x00).
 	const Woot<0x00> PS;
 	// Double-precision variant (prefix value 0x66).
 	const Woot<0x66> PD;
 	MovhlImplAll() {} //GCC.
 };
 // ------------------------------------------------------------------------
 // RegtoReg forms of MOVHL/MOVLH -- these are the same opcodes as MOVH/MOVL but
 // do something kinda different! Fun!
 //
 template< u16 Opcode >
 class MovhlImpl_RtoR
 {
 public:
 	// Single-precision form: written without a prefix.
 	__forceinline void PS( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ xOpWrite0F( Opcode, to, from ); }
 	// Double-precision form: written with the 0x66 prefix.
 	__forceinline void PD( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ xOpWrite0F( 0x66, Opcode, to, from ); }
 	MovhlImpl_RtoR() {} //GCC.
 };
 //////////////////////////////////////////////////////////////////////////////////////////
 // Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
 //
 // All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 // This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
 // which can be checked for alignment at runtime.
 // 
 template< u8 Prefix, bool isAligned >
 class SimdImpl_MoveSSE
 {
 	// Load-form opcodes; the store forms below use these values plus one.
 	static const u16 OpcodeA = 0x28;		// Aligned [aps] form
 	static const u16 OpcodeU = 0x10;		// unaligned [ups] form
 public:
 	SimdImpl_MoveSSE() {} //GCC.
 	// Register <- register.  Always uses the aligned opcode, and writes nothing at all
 	// when source and destination are the same register.
 	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
 	{
 		if( to != from ) xOpWrite0F( Prefix, OpcodeA, to, from );
 	}
 	// Register <- memory.
 	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const
 	{
 		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
 		// (low four displacement bits clear, no index register, no base register).
 		bool isReallyAligned = isAligned || ( ((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty() );
 		u16 opcode;
 		// Pick the aligned opcode whenever the check above allows it.
 		if (isReallyAligned) 
 			opcode = OpcodeA;
 		else 
 			opcode = OpcodeU;
 		xOpWrite0F( Prefix, opcode, to, from );
 	}
 	// Memory <- register.  Uses the store opcodes (load opcode + 1) and passes the
 	// register to xOpWrite0F ahead of the memory operand.
 	__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const
 	{
 		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
 		bool isReallyAligned = isAligned || ( (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty() );
 		xOpWrite0F( Prefix, isReallyAligned ? OpcodeA+1 : OpcodeU+1, from, to );
 	}
 };
 //////////////////////////////////////////////////////////////////////////////////////////
 // Implementations for MOVDQA / MOVDQU
 //
 // Note: the Prefix template parameter is never read; every overload chooses between
 // PrefixA and PrefixU instead.
 template< u8 Prefix, bool isAligned >
 class SimdImpl_MoveDQ
 {
 	static const u8 PrefixA = 0x66;		// Aligned [movdqa] form
 	static const u8 PrefixU = 0xf3;		// unaligned [movdqu] form
 	static const u16 Opcode = 0x6f;
 	static const u16 Opcode_Alt = 0x7f; // alternate ModRM encoding (reverse src/dst)
 public:
 	SimdImpl_MoveDQ() {} //GCC.
 	// Register <- register.  Always uses the aligned prefix, and writes nothing at all
 	// when source and destination are the same register.
 	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
 	{
 		if( to != from ) xOpWrite0F( PrefixA, Opcode, to, from );
 	}
 	// Register <- memory.
 	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const
 	{
 		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
 		bool isReallyAligned = isAligned || ( (from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty() );
 		xOpWrite0F( isReallyAligned ? PrefixA : PrefixU, Opcode, to, from );
 	}
 	// Memory <- register.  Uses Opcode_Alt and passes the register to xOpWrite0F ahead
 	// of the memory operand.
 	__forceinline void operator()( const ModSibBase& to, const xRegisterSSE& from ) const
 	{
 		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
 		bool isReallyAligned = isAligned || ( (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty() );
 		xOpWrite0F( isReallyAligned ? PrefixA : PrefixU, Opcode_Alt, from, to );
 	}
 };
 //////////////////////////////////////////////////////////////////////////////////////////
 // Blend - Conditional copying of values in src into dest.
 //
 class SimdImpl_Blend
 {
 public:
 	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
 	// mask bits in the immediate operand (bits [3:0]).  Each mask bit corresponds to a
 	// dword element in a 128-bit operand.
 	//
 	// If a mask bit is 1, then the corresponding dword in the source operand is copied
 	// to dest, else the dword element in dest is left unchanged.
 	//
 	SimdImpl_DestRegImmSSE<0x66,0x0c3a> PS;
 	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
 	// mask bits in the immediate operand (bits [1:0]).  Each mask bit corresponds to a
 	// quadword element in a 128-bit operand.
 	//
 	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
 	// to dest, else the quadword element in dest is left unchanged.
 	//
 	SimdImpl_DestRegImmSSE<0x66,0x0d3a> PD;
 	// [SSE-4.1] Conditionally copies dword values from src to dest, depending on the
 	// mask held in XMM0 (yes, the fixed register).  Each dword element of the 128-bit
 	// operand is controlled by the most significant bit of the matching dword in XMM0.
 	//
 	// If a mask bit is 1, then the corresponding dword in the source operand is copied
 	// to dest, else the dword element in dest is left unchanged.
 	//
 	SimdImpl_DestRegSSE<0x66,0x1438> VPS;
 	// [SSE-4.1] Conditionally copies quadword values from src to dest, depending on the
 	// mask held in XMM0 (yes, the fixed register).  Each quadword element of the 128-bit
 	// operand is controlled by the most significant bit of the matching quadword in XMM0.
 	//
 	// If a mask bit is 1, then the corresponding quadword in the source operand is copied
 	// to dest, else the quadword element in dest is left unchanged.
 	//
 	SimdImpl_DestRegSSE<0x66,0x1538> VPD;
 };
 //////////////////////////////////////////////////////////////////////////////////////////
 // Packed Move with Sign or Zero extension.
 //
 template< bool SignExtend >
 class SimdImpl_PMove
 {
 	// Opcode of the first (BW) form: 0x2038 when sign-extending, 0x3038 when
 	// zero-extending.  Each following form adds a further 0x100.
 	static const u16 OpcodeBase = SignExtend ? 0x2038 : 0x3038;
 public:
 	// The last template argument of each member is the memory operand type that callers
 	// must spell out through ModSibStrict<>.
 	//
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into word integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase,xRegisterSSE,xRegisterSSE,u64> BW;
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into dword integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x100,xRegisterSSE,xRegisterSSE,u32> BD;
 	// [SSE-4.1] Zero/Sign-extend the low byte values in src into qword integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x200,xRegisterSSE,xRegisterSSE,u16> BQ;
 	// [SSE-4.1] Zero/Sign-extend the low word values in src into dword integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x300,xRegisterSSE,xRegisterSSE,u64> WD;
 	// [SSE-4.1] Zero/Sign-extend the low word values in src into qword integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x400,xRegisterSSE,xRegisterSSE,u32> WQ;
 	// [SSE-4.1] Zero/Sign-extend the low dword values in src into qword integers
 	// and store them in dest.
 	SimdImpl_DestRegStrict<0x66,OpcodeBase+0x500,xRegisterSSE,xRegisterSSE,u64> DQ;
 };
--- a/common/include/x86emitter/instructions.h
+++ b/common/include/x86emitter/instructions.h
@ -393,29 +393,30 @@ namespace x86Emitter
 	// ------------------------------------------------------------------------
-	extern const Internal::SimdImpl_MoveSSE<0x00,true> xMOVAPS;
+	extern const xImplSimd_MoveSSE xMOVAPS;
-	extern const Internal::SimdImpl_MoveSSE<0x00,false> xMOVUPS;
+	extern const xImplSimd_MoveSSE xMOVUPS;
 	extern const xImplSimd_MoveSSE xMOVAPD;
 	extern const xImplSimd_MoveSSE xMOVUPD;
 #ifdef ALWAYS_USE_MOVAPS
-	extern const Internal::SimdImpl_MoveSSE<0,true> xMOVDQA;
+	extern const xImplSimd_MoveSSE xMOVDQA;
-	extern const Internal::SimdImpl_MoveSSE<0,false> xMOVDQU;
+	extern const xImplSimd_MoveSSE xMOVDQU;
 	extern const Internal::SimdImpl_MoveSSE<0,true> xMOVAPD;
 	extern const Internal::SimdImpl_MoveSSE<0,false> xMOVUPD;
 #else
-	extern const Internal::SimdImpl_MoveDQ<0x66, 0x6f, 0x7f> xMOVDQA;
+	extern const xImplSimd_MoveDQ xMOVDQA;
-	extern const Internal::SimdImpl_MoveDQ<0xf3, 0x6f, 0x7f> xMOVDQU;
+	extern const xImplSimd_MoveDQ xMOVDQU;
 	extern const Internal::SimdImpl_MoveSSE<0x66,true> xMOVAPD;
 	extern const Internal::SimdImpl_MoveSSE<0x66,false> xMOVUPD;
 #endif
-	extern const Internal::MovhlImpl_RtoR<0x16> xMOVLH;
+	extern const xImplSimd_MovHL xMOVH;
-	extern const Internal::MovhlImpl_RtoR<0x12> xMOVHL;
+	extern const xImplSimd_MovHL xMOVL;
 	extern const xImplSimd_MovHL_RtoR xMOVLH;
 	extern const xImplSimd_MovHL_RtoR xMOVHL;
-	extern const Internal::MovhlImplAll<0x16> xMOVH;
+	extern const xImplSimd_Blend xBLEND;
-	extern const Internal::MovhlImplAll<0x12> xMOVL;
+	extern const xImplSimd_PMove xPMOVSX;
 	extern const xImplSimd_PMove xPMOVZX;
-	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
+	extern const xImplSimd_DestRegSSE xMOVSLDUP;
-	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
+	extern const xImplSimd_DestRegSSE xMOVSHDUP;
 	extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
 	extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
@ -425,16 +426,16 @@ namespace x86Emitter
 	// ------------------------------------------------------------------------
-	extern const Internal::SimdImpl_DestRegEither<0x66,0xdb> xPAND;
+	extern const xImplSimd_DestRegEither xPAND;
-	extern const Internal::SimdImpl_DestRegEither<0x66,0xdf> xPANDN;
+	extern const xImplSimd_DestRegEither xPANDN;
-	extern const Internal::SimdImpl_DestRegEither<0x66,0xeb> xPOR;
+	extern const xImplSimd_DestRegEither xPOR;
-	extern const Internal::SimdImpl_DestRegEither<0x66,0xef> xPXOR;
+	extern const xImplSimd_DestRegEither xPXOR;
-	extern const Internal::SimdImpl_Shuffle<0xc6>	xSHUF;
+	extern const xImplSimd_Shuffle		xSHUF;
 	// ------------------------------------------------------------------------
-	extern const Internal::SimdImpl_DestRegSSE<0x66,0x1738> xPTEST;
+	extern const xImplSimd_DestRegSSE	xPTEST;
 	extern const xImplSimd_MinMax		xMIN;
 	extern const xImplSimd_MinMax		xMAX; 
@ -526,16 +527,12 @@ namespace x86Emitter
 	extern const xImplSimd_DotProduct		xDP;
 	extern const xImplSimd_Round			xROUND;
-	extern const Internal::SimdImpl_PShuffle			xPSHUF;
+	extern const xImplSimd_PShuffle			xPSHUF;
-	extern const Internal::SimdImpl_PUnpack				xPUNPCK;
+	extern const SimdImpl_PUnpack			xPUNPCK;
-	extern const Internal::SimdImpl_Unpack				xUNPCK;
+	extern const xImplSimd_Unpack			xUNPCK;
-	extern const Internal::SimdImpl_Pack				xPACK;
+	extern const SimdImpl_Pack				xPACK;
-	extern const Internal::SimdImpl_PInsert				xPINSR;
+	extern const xImplSimd_PInsert			xPINSR;
-	extern const Internal::SimdImpl_PExtract			xPEXTR;
+	extern const SimdImpl_PExtract			xPEXTR;
 	extern const Internal::SimdImpl_Blend				xBLEND;
 	extern const Internal::SimdImpl_PMove<true>			xPMOVSX;
 	extern const Internal::SimdImpl_PMove<false>		xPMOVZX;
 }
--- a/common/include/x86emitter/x86types.h
+++ b/common/include/x86emitter/x86types.h
@ -703,8 +703,6 @@ __forceinline void xWrite( T val )
 	{
 		#include "implement/helpers.h"
 		#include "implement/simd_templated_helpers.h"
 		#include "implement/xmm/moremovs.h"
 		#include "implement/xmm/shufflepack.h"
 		#include "implement/group1.h"
 		#include "implement/group2.h"
 		#include "implement/group3.h"
@ -737,7 +735,9 @@ __forceinline void xWrite( T val )
 }
 #include "implement/simd_helpers.h"
 #include "implement/simd_moremovs.h"
 #include "implement/simd_arithmetic.h"
 #include "implement/simd_comparisons.h"
 #include "implement/simd_shufflepack.h"
 #include "inlines.inl"
--- a/common/src/x86emitter/simd.cpp
+++ b/common/src/x86emitter/simd.cpp
@ -106,56 +106,19 @@ __emitinline void Internal::SimdPrefix( u8 prefix, u16 opcode )
 	}
 }
-// [SSE-3]
+const xImplSimd_DestRegEither xPAND		= { 0x66,0xdb };
-const SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
+const xImplSimd_DestRegEither xPANDN	= { 0x66,0xdf };
-// [SSE-3]
+const xImplSimd_DestRegEither xPOR		= { 0x66,0xeb };
-const SimdImpl_DestRegSSE<0xf3,0x16> xMOVSHDUP;
+const xImplSimd_DestRegEither xPXOR		= { 0x66,0xef };
 const SimdImpl_MoveSSE<0x00,true> xMOVAPS;
 // Note: All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 // This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
 // which can be checked for alignment at runtime.
 const SimdImpl_MoveSSE<0x00,false> xMOVUPS;
 #ifdef ALWAYS_USE_MOVAPS
 const SimdImpl_MoveSSE<0,true> xMOVDQA;
 const SimdImpl_MoveSSE<0,true> xMOVAPD;
 // Note: All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 // This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
 // which can be checked for alignment at runtime.
 const SimdImpl_MoveSSE<0,false> xMOVDQU;
 const SimdImpl_MoveSSE<0,false> xMOVUPD;
 #else
 const SimdImpl_MoveDQ<0x66, 0x6f, 0x7f> xMOVDQA;
 const SimdImpl_MoveDQ<0xf3, 0x6f, 0x7f> xMOVDQU;
 const SimdImpl_MoveSSE<0x66,true> xMOVAPD;
 const SimdImpl_MoveSSE<0x66,false> xMOVUPD;
 #endif
 const MovhlImplAll<0x16>		xMOVH;
 const MovhlImplAll<0x12>		xMOVL;
 const MovhlImpl_RtoR<0x16>		xMOVLH;
 const MovhlImpl_RtoR<0x12>		xMOVHL;
 const SimdImpl_Shuffle<0xc6>	xSHUF;
 const SimdImpl_DestRegEither<0x66,0xdb> xPAND;
 const SimdImpl_DestRegEither<0x66,0xdf> xPANDN;
 const SimdImpl_DestRegEither<0x66,0xeb> xPOR;
 const SimdImpl_DestRegEither<0x66,0xef> xPXOR;
 // ------------------------------------------------------------------------
 // [SSE-4.1] Performs a bitwise AND of dest against src, and sets the ZF flag
 // only if all bits in the result are 0.  PTEST also sets the CF flag according
 // to the following condition: (xmm2/m128 AND NOT xmm1) == 0;
-const SimdImpl_DestRegSSE<0x66,0x1738>		xPTEST;
+const xImplSimd_DestRegSSE		xPTEST = { 0x66,0x1738 };
-// ------------------------------------------------------------------------
+// =====================================================================================================
 // SSE Conversion Operations, as looney as they are.
-// 
+// =====================================================================================================
 // These enforce pointer strictness for Indirect forms, due to the otherwise completely confusing
 // nature of the functions.  (so if a function expects an m32, you must use (u32*) or ptr32[]).
 //
@ -227,8 +190,8 @@ void xImplSimd_DestRegImmMMX::operator()( const xRegisterMMX& to, const ModSibBa
 void xImplSimd_DestRegEither::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
 void xImplSimd_DestRegEither::operator()( const xRegisterSSE& to, const ModSibBase& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
-void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteMMX( Opcode ); }
+void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteSSE( 0x00, Opcode ); }
-void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteMMX( Opcode ); }
+void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteSSE( 0x00, Opcode ); }
 // =====================================================================================================
 //  SIMD Arithmetic Instructions
@ -237,8 +200,8 @@ void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const ModSibBa
 void _SimdShiftHelper::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
 void _SimdShiftHelper::operator()( const xRegisterSSE& to, const ModSibBase& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
-void _SimdShiftHelper::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteMMX( Opcode ); }
+void _SimdShiftHelper::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteSSE( 0x00, Opcode ); }
-void _SimdShiftHelper::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteMMX( Opcode ); }
+void _SimdShiftHelper::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteSSE( 0x00, Opcode ); }
 void _SimdShiftHelper::operator()( const xRegisterSSE& to, u8 imm8 ) const
 {
@ -471,64 +434,231 @@ const xImplSimd_PMinMax xPMAX =
 	{ 0x66, 0x3f38 },		// UD
 };
-const SimdImpl_PShuffle xPSHUF;
+// =====================================================================================================
-const SimdImpl_PUnpack xPUNPCK;
+//  SIMD Shuffle/Pack  (Shuffle puck?)
-const SimdImpl_Unpack xUNPCK;
+// =====================================================================================================
 const SimdImpl_Pack xPACK;
 const SimdImpl_PInsert xPINSR;
 const SimdImpl_PExtract xPEXTR;
 const SimdImpl_Blend xBLEND;
-const SimdImpl_PMove<true> xPMOVSX;
+__forceinline void xImplSimd_Shuffle::_selector_assertion_check( u8 selector ) const
 const SimdImpl_PMove<false> xPMOVZX;
 //////////////////////////////////////////////////////////////////////////////////////////
 //
 // Converts from MMX register mode to FPU register mode.  The cpu enters MMX register mode
 // whenever MMX instructions are run, and if FPU instructions are run without using EMMS,
 // the FPU results will be invalid.
 __forceinline void xEMMS()	{ xWrite16( 0x770F ); }
 // [3DNow] Same as EMMS, but an AMD special version which may (or may not) leave MMX regs
 // in an undefined state (which is fine, since presumably you're done using them anyway).
 // This instruction is thus faster than EMMS on K8s, but all newer AMD cpus use the same
 // logic for either EMMS or FEMMS.
 // Conclusion: Obsolete.  Just use EMMS instead.
 __forceinline void xFEMMS()	{ xWrite16( 0x0E0F ); }
 // Store Streaming SIMD Extension Control/Status to Mem32.
 __emitinline void xSTMXCSR( const ModSib32& dest )
 {
-	SimdPrefix( 0, 0xae );
+	pxAssertMsg( (selector & ~3) == 0,
-	EmitSibMagic( 3, dest );
+		"Invalid immediate operand on SSE Shuffle: Upper 6 bits of the SSE Shuffle-PD Selector are reserved and must be zero."
 	);
 }
-// Load Streaming SIMD Extension Control/Status from Mem32.
+void xImplSimd_Shuffle::PS( const xRegisterSSE& to, const xRegisterSSE& from, u8 selector ) const
 __emitinline void xLDMXCSR( const ModSib32& src )
 {
-	SimdPrefix( 0, 0xae );
+	xOpWrite0F( 0xc6, to, from, selector );
 	EmitSibMagic( 2, src );
 }
-// Save x87 FPU, MMX Technology, and SSE State to buffer
+void xImplSimd_Shuffle::PS( const xRegisterSSE& to, const ModSibBase& from, u8 selector ) const
 // Target buffer must be at least 512 bytes in length to hold the result.
 __emitinline void xFXSAVE( const ModSibBase& dest )
 {
-	SimdPrefix( 0, 0xae );
+	xOpWrite0F( 0xc6, to, from, selector );
 	EmitSibMagic( 0, dest );
 }
-// Restore x87 FPU, MMX , XMM, and MXCSR State.
+void xImplSimd_Shuffle::PD( const xRegisterSSE& to, const xRegisterSSE& from, u8 selector ) const
 // Source buffer should be 512 bytes in length.
 __emitinline void xFXRSTOR( const ModSibBase& src )
 {
-	SimdPrefix( 0, 0xae );
+	_selector_assertion_check( selector );
-	EmitSibMagic( 1, src );
+	xOpWrite0F( 0x66, 0xc6, to, from, selector & 0x3 );
 }
 // Shuffle-PD, memory-source form: emits prefix 0x66, opcode 0xc6, followed by the selector
 // as the immediate operand.
 void xImplSimd_Shuffle::PD( const xRegisterSSE& to, const ModSibBase& from, u8 selector ) const
 {
 	// Run the selector assertion check first; the emit below still masks the selector
 	// down to its low two bits regardless of what the check does.
 	_selector_assertion_check( selector );
 	xOpWrite0F( 0x66, 0xc6, to, from, selector & 0x3 );
 }
 // Register-source form: emits prefix 0x66, this instance's Opcode, then imm8.
 void xImplSimd_InsertExtractHelper::operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
 {
 	xOpWrite0F( 0x66, Opcode, to, from, imm8 );
 }
 // Memory-operand form: same prefix/opcode/immediate as above, with an indirect operand.
 void xImplSimd_InsertExtractHelper::operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
 {
 	xOpWrite0F( 0x66, Opcode, to, from, imm8 );
 }
 // Word insert (xPINSR.W): the SSE-register forms use prefix 0x66 with opcode 0xc4, while the
 // MMX-register forms use the same opcode with no prefix argument.
 void xImplSimd_PInsert::W( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
 void xImplSimd_PInsert::W( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0xc4, to, from, imm8 ); }
 void xImplSimd_PInsert::W( const xRegisterMMX& to, const xRegister32& from, u8 imm8 ) const		{ xOpWrite0F( 0xc4, to, from, imm8 ); }
 void xImplSimd_PInsert::W( const xRegisterMMX& to, const ModSibBase& from, u8 imm8 ) const		{ xOpWrite0F( 0xc4, to, from, imm8 ); }
 // Word extract (xPEXTR.W): register destinations use opcode 0xc5 (prefix 0x66 for an SSE
 // source, none for MMX).  The memory-destination form uses the two-byte opcode value 0x153a
 // and passes its operands in swapped order (from, dest).
 void SimdImpl_PExtract::W( const xRegister32& to, const xRegisterSSE& from, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0xc5, to, from, imm8 ); }
 void SimdImpl_PExtract::W( const xRegister32& to, const xRegisterMMX& from, u8 imm8 ) const		{ xOpWrite0F( 0xc5, to, from, imm8 ); }
 void SimdImpl_PExtract::W( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const	{ xOpWrite0F( 0x66, 0x153a, from, dest, imm8 ); }
 // Declared without initializer data; the xImplSimd_Shuffle::PD definition in this file
 // passes its prefix and opcode (0x66, 0xc6) as literals rather than reading them from members.
 const xImplSimd_Shuffle xSHUF;
 // Opcode table for the xPSHUF group, one row per form (W, D, LW, HW, B).
 // NOTE(review): each row looks like { prefix, opcode } -- confirm against the
 // xImplSimd_PShuffle declaration.  Only the B row carries a two-byte opcode value.
 const xImplSimd_PShuffle xPSHUF =
 {
 	{ 0x00, 0x70 },		// W
 	{ 0x66, 0x70 },		// D
 	{ 0xf2, 0x70 },		// LW
 	{ 0xf3, 0x70 },		// HW
 	{ 0x66, 0x0038 },	// B
 };
 // Opcode table for the xPUNPCK group: the four L* rows first, then the four H* rows.
 // Every row uses 0x66 as its first value.
 // NOTE(review): rows look like { prefix, opcode } -- confirm against SimdImpl_PUnpack.
 const SimdImpl_PUnpack xPUNPCK =
 {
 	{ 0x66, 0x60 },		// LBW
 	{ 0x66, 0x61 },		// LWD
 	{ 0x66, 0x62 },		// LDQ
 	{ 0x66, 0x6c },		// LQDQ
 	{ 0x66, 0x68 },		// HBW
 	{ 0x66, 0x69 },		// HWD
 	{ 0x66, 0x6a },		// HDQ
 	{ 0x66, 0x6d },		// HQDQ
 };
 // Opcode table for the xPACK group (SSWB, SSDW, USWB, USDW).  The USDW row is the only one
 // with a two-byte opcode value.
 // NOTE(review): rows look like { prefix, opcode } -- confirm against SimdImpl_Pack.
 const SimdImpl_Pack xPACK =
 {
 	{ 0x66, 0x63 },		// SSWB
 	{ 0x66, 0x6b },		// SSDW
 	{ 0x66, 0x67 },		// USWB
 	{ 0x66, 0x2b38 },	// USDW
 };
 // Opcode table for the xUNPCK group.  The PS rows use 0x00 and the PD rows use 0x66 as the
 // first value; the H rows share opcode 0x15 and the L rows share opcode 0x14.
 // NOTE(review): rows look like { prefix, opcode } -- confirm against xImplSimd_Unpack.
 const xImplSimd_Unpack xUNPCK =
 {
 	{ 0x00, 0x15 },		// HPS
 	{ 0x66, 0x15 },		// HPD
 	{ 0x00, 0x14 },		// LPS
 	{ 0x66, 0x14 },		// LPD
 };
 // Opcode values for the byte (B) and dword (D) insert forms.  Each row holds a single
 // opcode value; the W form is not listed here because its opcode is written directly in
 // the xImplSimd_PInsert::W definitions.
 const xImplSimd_PInsert xPINSR =
 {
 	{ 0x203a },			// B
 	{ 0x223a },			// D
 };
 // Opcode values for the byte (B) and dword (D) extract forms.  Each row holds a single
 // opcode value; the W form is not listed here because its opcodes are written directly in
 // the SimdImpl_PExtract::W definitions.
 const SimdImpl_PExtract xPEXTR =
 {
 	{ 0x143a },			// B
 	{ 0x163a },			// D
 };
 // =====================================================================================================
 //  SIMD Move And Blend Instructions
 // =====================================================================================================
 // Memory forms.  PS variants pass no prefix argument; PD variants pass 0x66.  Loads into a
 // register use the instance's Opcode; stores to memory use Opcode+1 and pass the operands
 // in swapped order (from, to).
 void xImplSimd_MovHL::PS( const xRegisterSSE& to, const ModSibBase& from ) const			{ xOpWrite0F( Opcode, to, from ); }
 void xImplSimd_MovHL::PS( const ModSibBase& to, const xRegisterSSE& from ) const			{ xOpWrite0F( Opcode+1, from, to ); }
 void xImplSimd_MovHL::PD( const xRegisterSSE& to, const ModSibBase& from ) const			{ xOpWrite0F( 0x66, Opcode, to, from ); }
 void xImplSimd_MovHL::PD( const ModSibBase& to, const xRegisterSSE& from ) const			{ xOpWrite0F( 0x66, Opcode+1, from, to ); }
 // Register-to-register forms: same prefix rule as above, always the instance's Opcode.
 void xImplSimd_MovHL_RtoR::PS( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ xOpWrite0F( Opcode, to, from ); }
 void xImplSimd_MovHL_RtoR::PD( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ xOpWrite0F( 0x66, Opcode, to, from ); }
 static const u16 MovPS_OpAligned		= 0x28;		// Aligned [aps] form
 static const u16 MovPS_OpUnaligned		= 0x10;		// unaligned [ups] form
 // Register-to-register move.  Always emitted with the aligned opcode; nothing at all is
 // emitted when source and destination are the same register.
 void xImplSimd_MoveSSE::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
 {
 	if( to != from )
 	{
 		xOpWrite0F( Prefix, MovPS_OpAligned, to, from );
 	}
 }
 // Load from memory.  The aligned opcode is used when this instance is flagged aligned, or
 // when the address is a bare displacement whose low four bits are zero.
 void xImplSimd_MoveSSE::operator()( const xRegisterSSE& to, const ModSibBase& from ) const
 {
 	bool useAligned = isAligned;
 	if( !useAligned )
 	{
 		// Displacement-only address: alignment can be decided right here at emit time.
 		useAligned = ((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty();
 	}
 	xOpWrite0F( Prefix, useAligned ? MovPS_OpAligned : MovPS_OpUnaligned, to, from );
 }
 // Store to memory.  Same alignment rule as the load form; the store opcodes are the load
 // opcodes plus one, and the operands are passed in swapped order (from, to).
 void xImplSimd_MoveSSE::operator()( const ModSibBase& to, const xRegisterSSE& from ) const
 {
 	bool useAligned = isAligned;
 	if( !useAligned )
 	{
 		// Displacement-only address: alignment can be decided right here at emit time.
 		useAligned = (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty();
 	}
 	xOpWrite0F( Prefix, useAligned ? MovPS_OpAligned+1 : MovPS_OpUnaligned+1, from, to );
 }
 static const u8 MovDQ_PrefixAligned		= 0x66;		// Aligned [dqa] form
 static const u8 MovDQ_PrefixUnaligned	= 0xf3;		// unaligned [dqu] form
 // Register-to-register move.  Always emitted with the aligned prefix and opcode 0x6f;
 // nothing is emitted when source and destination are the same register.
 void xImplSimd_MoveDQ::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const
 {
 	if( to != from )
 	{
 		xOpWrite0F( MovDQ_PrefixAligned, 0x6f, to, from );
 	}
 }
 // Load from memory (opcode 0x6f).  Here alignment selects the prefix rather than the
 // opcode: the aligned prefix is used when this instance is flagged aligned, or when the
 // address is a bare displacement whose low four bits are zero.
 void xImplSimd_MoveDQ::operator()( const xRegisterSSE& to, const ModSibBase& from ) const
 {
 	bool useAligned = isAligned;
 	if( !useAligned )
 	{
 		// Displacement-only address: alignment can be decided right here at emit time.
 		useAligned = (from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty();
 	}
 	xOpWrite0F( useAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x6f, to, from );
 }
 // Store to memory.  Same alignment rule as the load form.
 void xImplSimd_MoveDQ::operator()( const ModSibBase& to, const xRegisterSSE& from ) const
 {
 	bool useAligned = isAligned;
 	if( !useAligned )
 	{
 		// Displacement-only address: alignment can be decided right here at emit time.
 		useAligned = (to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty();
 	}
 	// use opcode 0x7f : alternate ModRM encoding (reverse src/dst)
 	xOpWrite0F( useAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x7f, from, to );
 }
 // Packed move with sign/zero extension.  Every form is written with prefix 0x66 and an
 // opcode derived from the instance's OpcodeBase: BW=+0, BD=+0x100, BQ=+0x200, WD=+0x300,
 // WQ=+0x400, DQ=+0x500.  Each form has a register-source overload and a strictly-typed
 // memory-source overload (ModSibStrict<u64>, <u32> or <u16> depending on the form).
 void xImplSimd_PMove::BW( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase ); }
 void xImplSimd_PMove::BW( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase ); }
 void xImplSimd_PMove::BD( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase+0x100 ); }
 void xImplSimd_PMove::BD( const xRegisterSSE& to, const ModSibStrict<u32>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase+0x100 ); }
 void xImplSimd_PMove::BQ( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase+0x200 ); }
 void xImplSimd_PMove::BQ( const xRegisterSSE& to, const ModSibStrict<u16>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase+0x200 ); }
 void xImplSimd_PMove::WD( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase+0x300 ); }
 void xImplSimd_PMove::WD( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase+0x300 ); }
 void xImplSimd_PMove::WQ( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase+0x400 ); }
 void xImplSimd_PMove::WQ( const xRegisterSSE& to, const ModSibStrict<u32>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase+0x400 ); }
 void xImplSimd_PMove::DQ( const xRegisterSSE& to, const xRegisterSSE& from ) const		{ OpWriteSSE( 0x66, OpcodeBase+0x500 ); }
 void xImplSimd_PMove::DQ( const xRegisterSSE& to, const ModSibStrict<u64>& from ) const	{ OpWriteSSE( 0x66, OpcodeBase+0x500 ); }
 // Aligned/unaligned move instances.  The two initializer values appear to be
 // { Prefix, isAligned }, the members read by the operator() definitions -- confirm against
 // the struct declarations.
 // Note: the "unaligned" instances still emit the aligned encoding for Mem,Reg and Reg,Mem
 // forms when the address is a bare displacement that is 16-byte aligned.
 const xImplSimd_MoveSSE xMOVAPS = { 0x00, true };
 const xImplSimd_MoveSSE xMOVUPS = { 0x00, false };
 #ifdef ALWAYS_USE_MOVAPS
 	// With ALWAYS_USE_MOVAPS defined, the DQ and PD moves are plain xImplSimd_MoveSSE
 	// instances with prefix 0x00, i.e. initialized the same as xMOVAPS / xMOVUPS above.
 	const xImplSimd_MoveSSE xMOVDQA	= { 0x00, true };
 	const xImplSimd_MoveSSE xMOVAPD	= { 0x00, true };
 	const xImplSimd_MoveSSE xMOVDQU	= { 0x00, false };
 	const xImplSimd_MoveSSE xMOVUPD	= { 0x00, false };
 #else
 	// NOTE(review): the xImplSimd_MoveDQ::operator() definitions in this file choose their
 	// prefix from the alignment decision and never read the first value given here.
 	const xImplSimd_MoveDQ xMOVDQA	= { 0x66, true };
 	const xImplSimd_MoveSSE xMOVAPD	= { 0x66, true };
 	const xImplSimd_MoveDQ xMOVDQU	= { 0xf3, false };
 	const xImplSimd_MoveSSE xMOVUPD	= { 0x66, false };
 #endif
 // High/low half moves.  The single initializer is the Opcode used by the PS/PD methods:
 // 0x16 for the H / LH instances, 0x12 for the L / HL instances.
 const xImplSimd_MovHL xMOVH = { 0x16 };
 const xImplSimd_MovHL xMOVL = { 0x12 };
 const xImplSimd_MovHL_RtoR xMOVLH = { 0x16 };
 const xImplSimd_MovHL_RtoR xMOVHL = { 0x12 };
 // Opcode table for the xBLEND group (PS, PD, VPS, VPD).  Every row uses 0x66 as its first
 // value and a two-byte opcode value as its second.
 // NOTE(review): rows look like { prefix, opcode } -- confirm against xImplSimd_Blend.
 const xImplSimd_Blend xBLEND =
 {
 	{ 0x66, 0x0c3a },		// PS
 	{ 0x66, 0x0d3a },		// PD
 	{ 0x66, 0x1438 },		// VPS
 	{ 0x66, 0x1538 },		// VPD
 };
 // Sign-extending and zero-extending packed moves.  The initializer is presumably the
 // OpcodeBase that the xImplSimd_PMove methods add their per-form offsets to -- confirm
 // against the struct declaration.
 const xImplSimd_PMove xPMOVSX = { 0x2038 };
 const xImplSimd_PMove xPMOVZX = { 0x3038 };
 // [SSE-3]  Initialized with the values 0xf3 and 0x12.
 const xImplSimd_DestRegSSE xMOVSLDUP = { 0xf3,0x12 };
 // [SSE-3]  Initialized with the values 0xf3 and 0x16.
 const xImplSimd_DestRegSSE xMOVSHDUP = { 0xf3,0x16 };
 //////////////////////////////////////////////////////////////////////////////////////////
 // MMX Mov Instructions (MOVD, MOVQ, MOVSS).
 //
@ -645,9 +775,9 @@ __forceinline void xPALIGNR( const xRegisterSSE& to, const xRegisterSSE& from, u
 // MMX-register form: no prefix argument, two-byte opcode value 0x0f3a, then imm8.
 __forceinline void xPALIGNR( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm8 )	{ xOpWrite0F( 0x0f3a, to, from, imm8 ); }
-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
 //  INSERTPS / EXTRACTPS   [SSE4.1 only!]
-//
+// --------------------------------------------------------------------------------------
 // [TODO] these might be served better as classes, especially if other instructions use
 // the M32,sse,imm form (I forget offhand if any do).
@ -674,4 +804,52 @@ __emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& fr
 // Both forms use prefix 0x66 and the opcode value 0x173a.  The memory-destination form
 // requires a strictly-typed 32-bit operand and passes its operands in swapped order (from, dest).
 __emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )		{ xOpWrite0F( 0x66, 0x173a, to, from, imm8 ); }
 __emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 ){ xOpWrite0F( 0x66, 0x173a, from, dest, imm8 ); }
 // =====================================================================================================
 //  Ungrouped Instructions!
 // =====================================================================================================
 // Converts from MMX register mode to FPU register mode.  The cpu enters MMX register mode
 // whenever MMX instructions are run, and if FPU instructions are run without using EMMS,
 // the FPU results will be invalid.
 __forceinline void xEMMS()	{ xWrite16( 0x770F ); }
 // [3DNow] Same as EMMS, but an AMD special version which may (or may not) leave MMX regs
 // in an undefined state (which is fine, since presumably you're done using them anyway).
 // This instruction is thus faster than EMMS on K8s, but all newer AMD cpus use the same
 // logic for either EMMS or FEMMS.
 // Conclusion: Obsolete.  Just use EMMS instead.
 // Emitted as the single 16-bit value 0x0E0F.
 __forceinline void xFEMMS()	{ xWrite16( 0x0E0F ); }
 // Store Streaming SIMD Extension Control/Status to Mem32.
 // Written as opcode 0xae with no prefix, followed by the memory operand with a
 // register-field value of 3.
 __emitinline void xSTMXCSR( const ModSib32& dest )
 {
 	SimdPrefix( 0, 0xae );
 	EmitSibMagic( 3, dest );
 }
 // Load Streaming SIMD Extension Control/Status from Mem32.
 // Written as opcode 0xae with no prefix, followed by the memory operand with a
 // register-field value of 2.
 __emitinline void xLDMXCSR( const ModSib32& src )
 {
 	SimdPrefix( 0, 0xae );
 	EmitSibMagic( 2, src );
 }
 // Save x87 FPU, MMX Technology, and SSE State to buffer
 // Target buffer must be at least 512 bytes in length to hold the result.
 // Written as opcode 0xae with no prefix, followed by the memory operand with a
 // register-field value of 0.
 __emitinline void xFXSAVE( const ModSibBase& dest )
 {
 	SimdPrefix( 0, 0xae );
 	EmitSibMagic( 0, dest );
 }
 // Restore x87 FPU, MMX, XMM, and MXCSR State.
 // Source buffer should be 512 bytes in length.
 // Written as opcode 0xae with no prefix, followed by the memory operand with a
 // register-field value of 1.
 __emitinline void xFXRSTOR( const ModSibBase& src )
 {
 	SimdPrefix( 0, 0xae );
 	EmitSibMagic( 1, src );
 }
 }