Emitter rewrite, part 1 of 5 (or so...): Re-tooled SSE arithmetic instructions to be class/template free.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2067 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-10-24 19:06:11 +00:00 · 2009-10-24 19:06:11 +00:00 · 6fdaea2c79
parent e8d858c675
commit 6fdaea2c79
8 changed files with 364 additions and 172 deletions
--- a/common/build/x86emitter/x86emitter.vcproj
+++ b/common/build/x86emitter/x86emitter.vcproj
@ -6,6 +6,7 @@
 	ProjectGUID="{A51123F5-9505-4EAE-85E7-D320290A272C}"
 	RootNamespace="x86emitter"
 	Keyword="Win32Proj"
+	TargetFrameworkVersion="0"
 	>
 	<Platforms>
 		<Platform
@ -331,14 +332,6 @@
 			<Filter
 				Name="Implement_Simd"
 				>
-				<File
-					RelativePath="..\..\include\x86emitter\implement\xmm\arithmetic.h"
-					>
-				</File>
-				<File
-					RelativePath="..\..\include\x86emitter\implement\xmm\basehelpers.h"
-					>
-				</File>
 				<File
 					RelativePath="..\..\include\x86emitter\implement\xmm\comparisons.h"
 					>
@ -351,6 +344,18 @@
 					RelativePath="..\..\include\x86emitter\implement\xmm\shufflepack.h"
 					>
 				</File>
+				<File
+					RelativePath="..\..\include\x86emitter\implement\simd_arithmetic.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\include\x86emitter\implement\simd_helpers.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\include\x86emitter\implement\simd_templated_helpers.h"
+					>
+				</File>
 			</Filter>
 		</Filter>
 		<Filter
--- a/common/include/x86emitter/implement/helpers.h
+++ b/common/include/x86emitter/implement/helpers.h
@ -20,9 +20,11 @@
 // that all members contained within are in said namespace.
 // ------------------------------------------------------------------------

-
 #pragma once

+#define OpWriteSSE( pre, op )		xOpWrite0F( pre, op, to, from )	// expands against identifiers 'to' and 'from', which must already be in scope at the use site
+#define OpWriteMMX( op )			xOpWrite0F( op, to, from )		// same reliance on 'to' and 'from'; passes no prefix argument
+
 extern void SimdPrefix( u8 prefix, u16 opcode );
 extern void EmitSibMagic( uint regfield, const void* address );
 extern void EmitSibMagic( uint regfield, const ModSibBase& info );
--- a/common/include/x86emitter/implement/simd_arithmetic.h
+++ b/common/include/x86emitter/implement/simd_arithmetic.h
@ -15,101 +15,81 @@

 #pragma once

-//////////////////////////////////////////////////////////////////////////////////////////
-// ShiftHelper -- It's out here because C++ child class template semantics are generally
-// not cross-compiler friendly.
-//
-template< u16 Opcode1, u16 OpcodeImm, u8 Modcode >
-class _SimdShiftHelper
+namespace x86Emitter {
+
+// --------------------------------------------------------------------------------------
+//  _SimdShiftHelper
+// --------------------------------------------------------------------------------------
+struct _SimdShiftHelper
 {
-public:
-	_SimdShiftHelper() {}
+	u8		Prefix;
+	u16		Opcode;
+	u16		OpcodeImm;
+	u8		Modcode;

-	__forceinline void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const { xOpWrite0F( 0x66, Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterSSE& to, const ModSibBase& from ) const	{ xOpWrite0F( 0x66, Opcode1, to, from ); }
+	void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const;
+	void operator()( const xRegisterSSE& to, const ModSibBase& from ) const;

-	__forceinline void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const { xOpWrite0F( Opcode1, to, from ); }
-	__forceinline void operator()( const xRegisterMMX& to, const ModSibBase& from ) const	{ xOpWrite0F( Opcode1, to, from ); }
+	void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const;
+	void operator()( const xRegisterMMX& to, const ModSibBase& from ) const;

-
-	__emitinline void operator()( const xRegisterSSE& to, u8 imm8 ) const
-	{
-		SimdPrefix( 0x66, OpcodeImm );
-		EmitSibMagic( (int)Modcode, to );
-		xWrite8( imm8 );
-	}
-
-	__emitinline void operator()( const xRegisterMMX& to, u8 imm8 ) const
-	{
-		SimdPrefix( 0x00, OpcodeImm );
-		EmitSibMagic( (int)Modcode, to );
-		xWrite8( imm8 );
-	}
+	void operator()( const xRegisterSSE& to, u8 imm8 ) const;
+	void operator()( const xRegisterMMX& to, u8 imm8 ) const;
 };

-//////////////////////////////////////////////////////////////////////////////////////////
+// --------------------------------------------------------------------------------------
+//  xImplSimd_Shift / xImplSimd_ShiftWithoutQ
+// --------------------------------------------------------------------------------------
+
 // Used for PSRA, which lacks the Q form.
 //
-template< u16 OpcodeBase1, u8 Modcode >
-class SimdImpl_ShiftWithoutQ
+struct xImplSimd_ShiftWithoutQ
 {
-public:
-	const _SimdShiftHelper<OpcodeBase1+1,0x71,Modcode> W;
-	const _SimdShiftHelper<OpcodeBase1+2,0x72,Modcode> D;
-
-	SimdImpl_ShiftWithoutQ() {}
+	const _SimdShiftHelper W;
+	const _SimdShiftHelper D;
 };

-//////////////////////////////////////////////////////////////////////////////////////////
 // Implements PSRL and PSLL
 //
-template< u16 OpcodeBase1, u8 Modcode >
-class SimdImpl_Shift : public SimdImpl_ShiftWithoutQ<OpcodeBase1, Modcode>
+struct xImplSimd_Shift
 {
-public:
-	const _SimdShiftHelper<OpcodeBase1+3,0x73,Modcode> Q;
-	
-	__forceinline void DQ( const xRegisterSSE& to, u8 imm8 ) const		{ xOpWrite0F( 0x66, 0x73, (int)Modcode+1, to, imm8 ); }
-	
-	SimdImpl_Shift() {}
-};
+	const _SimdShiftHelper W;
+	const _SimdShiftHelper D;
+	const _SimdShiftHelper Q;

+	void DQ( const xRegisterSSE& to, u8 imm8 ) const;
+};

 //////////////////////////////////////////////////////////////////////////////////////////
 //
-template< u16 OpcodeB, u16 OpcodeQ >
-class SimdImpl_AddSub
+struct xImplSimd_AddSub
 {
-public:
-	const SimdImpl_DestRegEither<0x66,OpcodeB+0x20> B;
-	const SimdImpl_DestRegEither<0x66,OpcodeB+0x21> W;
-	const SimdImpl_DestRegEither<0x66,OpcodeB+0x22> D;
-	const SimdImpl_DestRegEither<0x66,OpcodeQ> Q;
+	const xImplSimd_DestRegEither B;
+	const xImplSimd_DestRegEither W;
+	const xImplSimd_DestRegEither D;
+	const xImplSimd_DestRegEither Q;

 	// Add/Sub packed signed byte [8bit] integers from src into dest, and saturate the results.
-	const SimdImpl_DestRegEither<0x66,OpcodeB+0x10> SB;
+	const xImplSimd_DestRegEither SB;

 	// Add/Sub packed signed word [16bit] integers from src into dest, and saturate the results.
-	const SimdImpl_DestRegEither<0x66,OpcodeB+0x11> SW;
+	const xImplSimd_DestRegEither SW;

 	// Add/Sub packed unsigned byte [8bit] integers from src into dest, and saturate the results.
-	const SimdImpl_DestRegEither<0x66,OpcodeB> USB;
+	const xImplSimd_DestRegEither USB;

 	// Add/Sub packed unsigned word [16bit] integers from src into dest, and saturate the results.
-	const SimdImpl_DestRegEither<0x66,OpcodeB+1> USW;
-
-	SimdImpl_AddSub() {}
+	const xImplSimd_DestRegEither USW;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 //
-class SimdImpl_PMul
+struct xImplSimd_PMul
 {
-public:
-	const SimdImpl_DestRegEither<0x66,0xd5> LW;
-	const SimdImpl_DestRegEither<0x66,0xe5> HW;
-	const SimdImpl_DestRegEither<0x66,0xe4> HUW;
-	const SimdImpl_DestRegEither<0x66,0xf4> UDQ;
+	const xImplSimd_DestRegEither LW;
+	const xImplSimd_DestRegEither HW;
+	const xImplSimd_DestRegEither HUW;
+	const xImplSimd_DestRegEither UDQ;

 	// [SSE-3] PMULHRSW multiplies vertically each signed 16-bit integer from dest with the
 	// corresponding signed 16-bit integer of source, producing intermediate signed 32-bit
@ -121,112 +101,95 @@ public:
 	//
 	// Both operands can be MMX or XMM registers.  Source can be register or memory.
 	//
-	const SimdImpl_DestRegEither<0x66,0x0b38> HRSW;
+	const xImplSimd_DestRegEither HRSW;
 	
 	// [SSE-4.1] Multiply the packed dword signed integers in dest with src, and store
 	// the low 32 bits of each product in xmm1.
-	const SimdImpl_DestRegSSE<0x66,0x4038> LD;
+	const xImplSimd_DestRegSSE LD;
 	
 	// [SSE-4.1] Multiply the packed signed dword integers in dest with src.
-	const SimdImpl_DestRegSSE<0x66,0x2838> DQ;
-	
-	SimdImpl_PMul() {}
+	const xImplSimd_DestRegSSE DQ;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // For instructions that have PS/SS form only (most commonly reciprocal Sqrt functions)
 //
-template< u16 OpcodeSSE >
-class SimdImpl_rSqrt
+struct xImplSimd_rSqrt
 {
-public:
-	const SimdImpl_DestRegSSE<0x00,OpcodeSSE> PS;
-	const SimdImpl_DestRegSSE<0xf3,OpcodeSSE> SS;
-	SimdImpl_rSqrt() {}
+	const xImplSimd_DestRegSSE PS;
+	const xImplSimd_DestRegSSE SS;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // SQRT has PS/SS/SD forms, but not the PD form.
 //
-template< u16 OpcodeSSE >
-class SimdImpl_Sqrt : public SimdImpl_rSqrt<OpcodeSSE>
+struct xImplSimd_Sqrt
 {
-public:
-	SimdImpl_Sqrt() {}
-	const SimdImpl_DestRegSSE<0xf2,OpcodeSSE> SD;
+	const xImplSimd_DestRegSSE PS;
+	const xImplSimd_DestRegSSE SS;
+	const xImplSimd_DestRegSSE SD;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 //
-class SimdImpl_AndNot
+struct xImplSimd_AndNot
 {
-public:
-	SimdImpl_AndNot() {}
-	const SimdImpl_DestRegSSE<0x00,0x55> PS;
-	const SimdImpl_DestRegSSE<0x66,0x55> PD;
+	const xImplSimd_DestRegSSE PS;
+	const xImplSimd_DestRegSSE PD;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // Packed absolute value. [sSSE3 only]
 //
-class SimdImpl_PAbsolute
+struct xImplSimd_PAbsolute
 {
-public:
-	SimdImpl_PAbsolute() {}
-	
 	// [sSSE-3] Computes the absolute value of bytes in the src, and stores the result
 	// in dest, as UNSIGNED.
-	const SimdImpl_DestRegEither<0x66, 0x1c38> B;
+	const xImplSimd_DestRegEither B;

 	// [sSSE-3] Computes the absolute value of word in the src, and stores the result
 	// in dest, as UNSIGNED.
-	const SimdImpl_DestRegEither<0x66, 0x1d38> W;
+	const xImplSimd_DestRegEither W;

 	// [sSSE-3] Computes the absolute value of doublewords in the src, and stores the
 	// result in dest, as UNSIGNED.
-	const SimdImpl_DestRegEither<0x66, 0x1e38> D;
+	const xImplSimd_DestRegEither D;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // Packed Sign [sSSE3 only] - Negate/zero/preserve packed integers in dest depending on the
 // corresponding sign in src.
 //
-class SimdImpl_PSign
+struct xImplSimd_PSign
 {
-public:
-	SimdImpl_PSign() {}
-
 	// [sSSE-3] negates each byte element of dest if the signed integer value of the
 	// corresponding data element in src is less than zero. If the signed integer value
 	// of a data element in src is positive, the corresponding data element in dest is
 	// unchanged. If a data element in src is zero, the corresponding data element in
 	// dest is set to zero.
-	const SimdImpl_DestRegEither<0x66, 0x0838> B;
+	const xImplSimd_DestRegEither B;

 	// [sSSE-3] negates each word element of dest if the signed integer value of the
 	// corresponding data element in src is less than zero. If the signed integer value
 	// of a data element in src is positive, the corresponding data element in dest is
 	// unchanged. If a data element in src is zero, the corresponding data element in
 	// dest is set to zero.
-	const SimdImpl_DestRegEither<0x66, 0x0938> W;
+	const xImplSimd_DestRegEither W;

 	// [sSSE-3] negates each doubleword element of dest if the signed integer value
 	// of the corresponding data element in src is less than zero. If the signed integer
 	// value of a data element in src is positive, the corresponding data element in dest
 	// is unchanged. If a data element in src is zero, the corresponding data element in
 	// dest is set to zero.
-	const SimdImpl_DestRegEither<0x66, 0x0a38> D;
+	const xImplSimd_DestRegEither D;

 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // Packed Multiply and Add!!
 //
-class SimdImpl_PMultAdd
+struct xImplSimd_PMultAdd
 {
-public:
-	SimdImpl_PMultAdd() {}
-
 	// Multiplies the individual signed words of dest by the corresponding signed words
 	// of src, producing temporary signed, doubleword results. The adjacent doubleword
 	// results are then summed and stored in the destination operand.
@ -235,7 +198,7 @@ public:
 	//   DEST[63:32] = ( DEST[47:32] * SRC[47:32]) + (DEST[63:48] * SRC[63:48] );
 	//   [.. repeat in the case of XMM src/dest operands ..]
 	//
-	const SimdImpl_DestRegEither<0x66, 0xf5> WD;
+	const xImplSimd_DestRegEither WD;

 	// [sSSE-3] multiplies vertically each unsigned byte of dest with the corresponding
 	// signed byte of src, producing intermediate signed 16-bit integers. Each adjacent
@ -251,17 +214,14 @@ public:
 	//   DEST[31-16] = SaturateToSignedWord( SRC[31-24] * DEST[31-24] + SRC[23-16] * DEST[23-16] );
 	//   [.. repeat for each 16 bits up to 64 (mmx) or 128 (xmm) ..]
 	//
-	const SimdImpl_DestRegEither<0x66, 0xf438> UBSW;
+	const xImplSimd_DestRegEither UBSW;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // Packed Horizontal Add [SSE3 only]
 //
-class SimdImpl_HorizAdd
+struct xImplSimd_HorizAdd
 {
-public:
-	SimdImpl_HorizAdd() {}
-	
 	// [SSE-3] Horizontal Add of Packed Data.  A three step process:
 	// * Adds the single-precision floating-point values in the first and second dwords of
 	//   dest and stores the result in the first dword of dest.
@ -269,24 +229,21 @@ public:
 	//   stores the result in the second dword of dest.
 	// * Adds single-precision floating-point values in the first and second dword of *src*
 	//   and stores the result in the third dword of dest.
-	const SimdImpl_DestRegSSE<0xf2, 0x7c> PS;
+	const xImplSimd_DestRegSSE PS;
 	
 	// [SSE-3] Horizontal Add of Packed Data.  A two step process:
 	// * Adds the double-precision floating-point values in the high and low quadwords of
 	//   dest and stores the result in the low quadword of dest.
 	// * Adds the double-precision floating-point values in the high and low quadwords of
 	//   *src* stores the result in the high quadword of dest.
-	const SimdImpl_DestRegSSE<0x66, 0x7c> PD;
+	const xImplSimd_DestRegSSE PD;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // DotProduct calculation (SSE4.1 only!)
 //
-class SimdImpl_DotProduct
+struct xImplSimd_DotProduct
 {
-public:
-	SimdImpl_DotProduct() {}
-
 	// [SSE-4.1] Conditionally multiplies the packed single precision floating-point
 	// values in dest with the packed single-precision floats in src depending on a
 	// mask extracted from the high 4 bits of the immediate byte. If a condition mask
@ -300,20 +257,17 @@ public:
 	// element in dest.  If a broadcast mask bit is zero, the corresponding element in
 	// the destination is set to zero.
 	//
-	SimdImpl_DestRegImmSSE<0x66,0x403a> PS;
+	xImplSimd_DestRegImmSSE PS;

 	// [SSE-4.1]
-	SimdImpl_DestRegImmSSE<0x66,0x413a> PD;
+	xImplSimd_DestRegImmSSE PD;
 };

 //////////////////////////////////////////////////////////////////////////////////////////
 // Rounds floating point values (packed or single scalar) by an arbitrary rounding mode.
 // (SSE4.1 only!)
-class SimdImpl_Round
+struct xImplSimd_Round
 {
-public:
-	SimdImpl_Round() {}
-
 	// [SSE-4.1] Rounds the 4 packed single-precision src values and stores them in dest.
 	//
 	// Imm8 specifies control fields for the rounding operation:
@ -324,7 +278,7 @@ public:
 	// Rounding Mode Reference:
 	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
 	//
-	const SimdImpl_DestRegImmSSE<0x66,0x083a> PS;
+	const xImplSimd_DestRegImmSSE PS;

 	// [SSE-4.1] Rounds the 2 packed double-precision src values and stores them in dest.
 	//
@ -336,7 +290,7 @@ public:
 	// Rounding Mode Reference:
 	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
 	//
-	const SimdImpl_DestRegImmSSE<0x66,0x093a> PD;
+	const xImplSimd_DestRegImmSSE PD;

 	// [SSE-4.1] Rounds the single-precision src value and stores in dest.
 	//
@ -348,7 +302,7 @@ public:
 	// Rounding Mode Reference:
 	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
 	//
-	const SimdImpl_DestRegImmSSE<0x66,0x0a3a> SS;
+	const xImplSimd_DestRegImmSSE SS;

 	// [SSE-4.1] Rounds the double-precision src value and stores in dest.
 	//
@ -360,5 +314,8 @@ public:
 	// Rounding Mode Reference:
 	//   0 - Nearest, 1 - Negative Infinity, 2 - Positive infinity, 3 - Truncate.
 	//
-	const SimdImpl_DestRegImmSSE<0x66,0x0b3a> SD;
+	const xImplSimd_DestRegImmSSE SD;
 };
+
+}	// End namespace x86Emitter
+
--- a/common/include/x86emitter/implement/simd_helpers.h
+++ b/common/include/x86emitter/implement/simd_helpers.h
@ -0,0 +1,76 @@
+/*  PCSX2 - PS2 Emulator for PCs
+ *  Copyright (C) 2002-2009  PCSX2 Dev Team
+ * 
+ *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ *  of the GNU Lesser General Public License as published by the Free Software Found-
+ *  ation, either version 3 of the License, or (at your option) any later version.
+ *
+ *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ *  PURPOSE.  See the GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along with PCSX2.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace x86Emitter {
+
+// =====================================================================================================
+//  xImpl_SIMD Types (template free!)
+// =====================================================================================================
+
+// ------------------------------------------------------------------------
+// For implementing SSE-only logic operations that have xmmreg,xmmreg/rm forms only,
+// like ANDPS/ANDPD.  Declares no constructors; simd.cpp fills instances in with
+// brace initializers of the form { Prefix, Opcode }.
+struct xImplSimd_DestRegSSE
+{
+	u8		Prefix;		// handed to xOpWrite0F ahead of the opcode by both overloads
+	u16		Opcode;		// handed to xOpWrite0F as the opcode by both overloads
+	
+	void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const;
+	void operator()( const xRegisterSSE& to, const ModSibBase& from ) const;
+};
+
+// ------------------------------------------------------------------------
+// For implementing SSE-only operations that have xmmreg,reg/rm,imm forms only
+// (PSHUFD / PSHUFHW / etc).  In this commit it is the member type of xDP and xROUND.
+//
+struct xImplSimd_DestRegImmSSE
+{
+	u8		Prefix;		// forwarded to xOpWrite0F ahead of the opcode
+	u16		Opcode;		// forwarded to xOpWrite0F as the opcode
+
+	void operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const;	// imm is forwarded as the final xOpWrite0F argument
+	void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm ) const;
+};
+
+struct xImplSimd_DestRegImmMMX	// same shape as xImplSimd_DestRegImmSSE, but taking xRegisterMMX destinations
+{
+	u8		Prefix;		// NOTE(review): neither operator() in simd.cpp reads this field as of this commit
+	u16		Opcode;		// forwarded to xOpWrite0F together with to, from and imm
+
+	void operator()( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm ) const;
+	void operator()( const xRegisterMMX& to, const ModSibBase& from, u8 imm ) const;
+};
+
+// ------------------------------------------------------------------------
+// For implementing MMX/SSE operations that have reg,reg/rm forms only,
+// but accept either MM or XMM destinations (most PADD/PSUB and other P arithmetic ops).
+//
+struct xImplSimd_DestRegEither
+{
+	u8		Prefix;		// read by the xRegisterSSE overloads only
+	u16		Opcode;		// read by all four overloads
+
+	void operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const;
+	void operator()( const xRegisterSSE& to, const ModSibBase& from ) const;
+
+	void operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const;	// MMX overloads emit through OpWriteMMX( Opcode ); Prefix is not passed along
+	void operator()( const xRegisterMMX& to, const ModSibBase& from ) const;
+};
+
+}	// end namespace x86Emitter
+
--- a/common/include/x86emitter/implement/simd_templated_helpers.h
+++ b/common/include/x86emitter/implement/simd_templated_helpers.h
--- a/common/include/x86emitter/instructions.h
+++ b/common/include/x86emitter/instructions.h
@ -428,13 +428,9 @@ namespace x86Emitter
 	extern const Internal::SimdImpl_DestRegEither<0x66,0xeb> xPOR;
 	extern const Internal::SimdImpl_DestRegEither<0x66,0xef> xPXOR;

-	extern const Internal::SimdImpl_AndNot			xANDN;

 	extern const Internal::SimdImpl_COMI<true>		xCOMI;
 	extern const Internal::SimdImpl_COMI<false>		xUCOMI;
-	extern const Internal::SimdImpl_rSqrt<0x53>		xRCP;
-	extern const Internal::SimdImpl_rSqrt<0x52>		xRSQRT;
-	extern const Internal::SimdImpl_Sqrt<0x51>		xSQRT;

 	extern const Internal::SimdImpl_MinMax<0x5f>	xMAX;
 	extern const Internal::SimdImpl_MinMax<0x5d>	xMIN;
@ -488,32 +484,36 @@ namespace x86Emitter

 	// ------------------------------------------------------------------------

-	extern const Internal::SimdImpl_Shift<0xf0, 6>		xPSLL;
-	extern const Internal::SimdImpl_Shift<0xd0, 2>		xPSRL;
-	extern const Internal::SimdImpl_ShiftWithoutQ<0xe0, 4> xPSRA;
+	extern const xImplSimd_AndNot			xANDN;
+	extern const xImplSimd_rSqrt			xRCP;
+	extern const xImplSimd_rSqrt			xRSQRT;
+	extern const xImplSimd_Sqrt				xSQRT;
+
+	extern const xImplSimd_Shift			xPSLL;
+	extern const xImplSimd_Shift			xPSRL;
+	extern const xImplSimd_ShiftWithoutQ	xPSRA;
+	extern const xImplSimd_AddSub			xPADD;
+	extern const xImplSimd_AddSub			xPSUB;
+	extern const xImplSimd_PMul				xPMUL;
+	extern const xImplSimd_PAbsolute		xPABS;
+	extern const xImplSimd_PSign			xPSIGN;
+	extern const xImplSimd_PMultAdd			xPMADD;
+	extern const xImplSimd_HorizAdd			xHADD;
+	extern const xImplSimd_DotProduct		xDP;
+	extern const xImplSimd_Round			xROUND;
+

-	extern const Internal::SimdImpl_AddSub<0xdc, 0xd4>	xPADD;
-	extern const Internal::SimdImpl_AddSub<0xd8, 0xfb>	xPSUB;
 	extern const Internal::SimdImpl_PMinMax<0xde,0x3c>	xPMAX;
 	extern const Internal::SimdImpl_PMinMax<0xda,0x38>	xPMIN;

-	extern const Internal::SimdImpl_PMul				xPMUL;
 	extern const Internal::SimdImpl_PCompare			xPCMP;
 	extern const Internal::SimdImpl_PShuffle			xPSHUF;
 	extern const Internal::SimdImpl_PUnpack				xPUNPCK;
 	extern const Internal::SimdImpl_Unpack				xUNPCK;
 	extern const Internal::SimdImpl_Pack				xPACK;
-
-	extern const Internal::SimdImpl_PAbsolute			xPABS;
-	extern const Internal::SimdImpl_PSign				xPSIGN;
 	extern const Internal::SimdImpl_PInsert				xPINSR;
 	extern const Internal::SimdImpl_PExtract			xPEXTR;
-	extern const Internal::SimdImpl_PMultAdd			xPMADD;
-	extern const Internal::SimdImpl_HorizAdd			xHADD;
-
 	extern const Internal::SimdImpl_Blend				xBLEND;
-	extern const Internal::SimdImpl_DotProduct			xDP;
-	extern const Internal::SimdImpl_Round				xROUND;

 	extern const Internal::SimdImpl_PMove<true>			xPMOVSX;
 	extern const Internal::SimdImpl_PMove<false>		xPMOVZX;
--- a/common/include/x86emitter/x86types.h
+++ b/common/include/x86emitter/x86types.h
@ -694,9 +694,8 @@ __forceinline void xWrite( T val )
 	namespace Internal
 	{
 		#include "implement/helpers.h"
-		#include "implement/xmm/basehelpers.h"
+		#include "implement/simd_templated_helpers.h"
 		#include "implement/xmm/moremovs.h"
-		#include "implement/xmm/arithmetic.h"
 		#include "implement/xmm/comparisons.h"
 		#include "implement/xmm/shufflepack.h"
 		#include "implement/group1.h"
@ -730,4 +729,7 @@ __forceinline void xWrite( T val )
 	}
 }

+#include "implement/simd_helpers.h"
+#include "implement/simd_arithmetic.h"
+
 #include "inlines.inl"
--- a/common/src/x86emitter/simd.cpp
+++ b/common/src/x86emitter/simd.cpp
@ -90,12 +90,8 @@ const MovhlImplAll<0x12>		xMOVL;
 const MovhlImpl_RtoR<0x16>		xMOVLH;
 const MovhlImpl_RtoR<0x12>		xMOVHL;

-const SimdImpl_AndNot			xANDN;
 const SimdImpl_COMI<true>		xCOMI;
 const SimdImpl_COMI<false>		xUCOMI;
-const SimdImpl_rSqrt<0x53>		xRCP;
-const SimdImpl_rSqrt<0x52>		xRSQRT;
-const SimdImpl_Sqrt<0x51>		xSQRT;

 const SimdImpl_MinMax<0x5f>		xMAX;
 const SimdImpl_MinMax<0x5d>		xMIN;
@ -160,32 +156,186 @@ const SimdImpl_DestRegStrict<0xf3,0x2c,xRegister32, xRegisterSSE,u32>		xCVTTSS2S

 // ------------------------------------------------------------------------

-const SimdImpl_Shift<0xd0, 2> xPSRL;
-const SimdImpl_Shift<0xf0, 6> xPSLL;
-const SimdImpl_ShiftWithoutQ<0xe0, 4> xPSRA;
+void xImplSimd_DestRegSSE::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const				{ OpWriteSSE( Prefix, Opcode ); }
+void xImplSimd_DestRegSSE::operator()( const xRegisterSSE& to, const ModSibBase& from ) const				{ OpWriteSSE( Prefix, Opcode ); }
+
+void xImplSimd_DestRegImmSSE::operator()( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
+void xImplSimd_DestRegImmSSE::operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Prefix, Opcode, to, from, imm ); }
+
+void xImplSimd_DestRegImmMMX::operator()( const xRegisterMMX& to, const xRegisterMMX& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }
+void xImplSimd_DestRegImmMMX::operator()( const xRegisterMMX& to, const ModSibBase& from, u8 imm ) const	{ xOpWrite0F( Opcode, to, from, imm ); }
+
+void xImplSimd_DestRegEither::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
+void xImplSimd_DestRegEither::operator()( const xRegisterSSE& to, const ModSibBase& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
+
+void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteMMX( Opcode ); }
+void xImplSimd_DestRegEither::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteMMX( Opcode ); }
+
+// =====================================================================================================
+//  SIMD Arithmetic Instructions
+// =====================================================================================================
+
+void _SimdShiftHelper::operator()( const xRegisterSSE& to, const xRegisterSSE& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
+void _SimdShiftHelper::operator()( const xRegisterSSE& to, const ModSibBase& from ) const			{ OpWriteSSE( Prefix, Opcode ); }
+
+void _SimdShiftHelper::operator()( const xRegisterMMX& to, const xRegisterMMX& from ) const			{ OpWriteMMX( Opcode ); }
+void _SimdShiftHelper::operator()( const xRegisterMMX& to, const ModSibBase& from ) const			{ OpWriteMMX( Opcode ); }
+
+void _SimdShiftHelper::operator()( const xRegisterSSE& to, u8 imm8 ) const	// shift-by-immediate, XMM destination
+{
+	SimdPrefix( 0x66, OpcodeImm );		// immediate form uses OpcodeImm (not Opcode) with a fixed 0x66 prefix argument
+	EmitSibMagic( (int)Modcode, to );	// Modcode goes in as the regfield argument, the register as the operand
+	xWrite8( imm8 );					// immediate byte is written last
+}
+
+void _SimdShiftHelper::operator()( const xRegisterMMX& to, u8 imm8 ) const	// shift-by-immediate, MMX destination
+{
+	SimdPrefix( 0x00, OpcodeImm );		// prefix argument is 0x00 here; presumably SimdPrefix emits no prefix byte for it -- TODO confirm
+	EmitSibMagic( (int)Modcode, to );	// Modcode goes in as the regfield argument, the register as the operand
+	xWrite8( imm8 );					// immediate byte is written last
+}
+
+void xImplSimd_Shift::DQ( const xRegisterSSE& to, u8 imm8 ) const	// whole-register (DQ) shift by immediate; XMM only
+{
+	xOpWrite0F( 0x66, 0x73, (int)Q.Modcode+1, to, imm8 );	// Q.Modcode+1 evaluates to 3 for xPSRL and 7 for xPSLL, given the tables in this file
+}
+
+
+const xImplSimd_ShiftWithoutQ xPSRA =
+{
+	{ 0x66, 0xe1, 0x71, 4 },	// W
+	{ 0x66, 0xe2, 0x72, 4 }		// D
+};
+
+const xImplSimd_Shift xPSRL =
+{
+	{ 0x66, 0xd1, 0x71, 2 },	// W
+	{ 0x66, 0xd2, 0x72, 2 },	// D
+	{ 0x66, 0xd3, 0x73, 2 },	// Q
+};
+
+const xImplSimd_Shift xPSLL =
+{
+	{ 0x66, 0xf1, 0x71, 6 },	// W
+	{ 0x66, 0xf2, 0x72, 6 },	// D
+	{ 0x66, 0xf3, 0x73, 6 },	// Q
+};
+
+
+const xImplSimd_AddSub xPADD =
+{
+	{ 0x66, 0xdc+0x20 },	// B    (= 0xfc)
+	{ 0x66, 0xdc+0x21 },	// W    (= 0xfd)
+	{ 0x66, 0xdc+0x22 },	// D    (= 0xfe)
+	{ 0x66, 0xd4 },			// Q
+
+	{ 0x66, 0xdc+0x10 },	// SB   (= 0xec)
+	{ 0x66, 0xdc+0x11 },	// SW   (= 0xed)
+	{ 0x66, 0xdc },			// USB
+	{ 0x66, 0xdc+1 },		// USW  (= 0xdd)
+};
+
+const xImplSimd_AddSub xPSUB =
+{
+	{ 0x66, 0xd8+0x20 },	// B    (= 0xf8)
+	{ 0x66, 0xd8+0x21 },	// W    (= 0xf9)
+	{ 0x66, 0xd8+0x22 },	// D    (= 0xfa)
+	{ 0x66, 0xfb },			// Q
+
+	{ 0x66, 0xd8+0x10 },	// SB   (= 0xe8)
+	{ 0x66, 0xd8+0x11 },	// SW   (= 0xe9)
+	{ 0x66, 0xd8 },			// USB
+	{ 0x66, 0xd8+1 },		// USW  (= 0xd9)
+};
+
+
+const xImplSimd_PMul xPMUL =
+{
+	{ 0x66, 0xd5 },		// LW
+	{ 0x66, 0xe5 },		// HW
+	{ 0x66, 0xe4 },		// HUW
+	{ 0x66, 0xf4 },		// UDQ
+
+	{ 0x66, 0x0b38 },	// HRSW
+	{ 0x66, 0x4038 },	// LD
+	{ 0x66, 0x2838 },	// DQ
+};
+
+const xImplSimd_rSqrt xRSQRT =
+{
+	{ 0x00, 0x52 },		// PS
+	{ 0xf3, 0x52 }		// SS
+};
+
+const xImplSimd_rSqrt xRCP =
+{
+	{ 0x00, 0x53 },		// PS
+	{ 0xf3, 0x53 }		// SS
+};
+
+const xImplSimd_Sqrt xSQRT =
+{
+	{ 0x00, 0x51 },		// PS
+	{ 0xf3, 0x51 },		// SS
+	{ 0xf2, 0x51 }		// SD
+};
+
+const xImplSimd_AndNot xANDN =
+{
+	{ 0x00, 0x55 },		// PS
+	{ 0x66, 0x55 }		// PD
+};
+
+const xImplSimd_PAbsolute xPABS = 
+{
+	{ 0x66, 0x1c38 },	// B
+	{ 0x66, 0x1d38 },	// W
+	{ 0x66, 0x1e38 }	// D
+};
+
+const xImplSimd_PSign xPSIGN =
+{
+	{ 0x66, 0x0838 },	// B
+	{ 0x66, 0x0938 },	// W
+	{ 0x66, 0x0a38 },	// D
+};
+
+const xImplSimd_PMultAdd xPMADD =
+{
+	{ 0x66, 0xf5 },		// WD   -- PMADDWD, opcode 0F F5
+	{ 0x66, 0x0438 },	// UBSW -- PMADDUBSW, opcode 0F 38 04; packed like the other 0x..38 entries here (e.g. 0x0b38). Was 0xf438, i.e. 0F 38 F4.
+};
+
+const xImplSimd_HorizAdd xHADD =
+{
+	{ 0xf2, 0x7c },		// PS
+	{ 0x66, 0x7c },		// PD
+};
+
+const xImplSimd_DotProduct xDP =
+{
+	{ 0x66,0x403a },	// PS
+	{ 0x66,0x413a },	// PD
+};
+
+const xImplSimd_Round xROUND =
+{
+	{ 0x66,0x083a },	// PS
+	{ 0x66,0x093a },	// PD
+	{ 0x66,0x0a3a },	// SS
+	{ 0x66,0x0b3a },	// SD
+};

-const SimdImpl_AddSub<0xdc, 0xd4> xPADD;
-const SimdImpl_AddSub<0xd8, 0xfb> xPSUB;
 const SimdImpl_PMinMax<0xde,0x3c> xPMAX;
 const SimdImpl_PMinMax<0xda,0x38> xPMIN;
-
-const SimdImpl_PMul xPMUL;
 const SimdImpl_PCompare xPCMP;
 const SimdImpl_PShuffle xPSHUF;
 const SimdImpl_PUnpack xPUNPCK;
 const SimdImpl_Unpack xUNPCK;
 const SimdImpl_Pack xPACK;
-
-const SimdImpl_PAbsolute xPABS;
-const SimdImpl_PSign xPSIGN;
 const SimdImpl_PInsert xPINSR;
 const SimdImpl_PExtract xPEXTR;
-const SimdImpl_PMultAdd xPMADD;
-const SimdImpl_HorizAdd xHADD;
-
 const SimdImpl_Blend xBLEND;
-const SimdImpl_DotProduct xDP;
-const SimdImpl_Round xROUND;

 const SimdImpl_PMove<true> xPMOVSX;
 const SimdImpl_PMove<false> xPMOVZX;