Added PMADD/HADD/INSERT/EXTRACT instructions, and fixed more cross-compiler problems.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1038 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-04-21 21:30:47 +00:00 · 2009-04-21 21:30:47 +00:00 · 27a8f3aa9a
parent 2dcee32079
commit 27a8f3aa9a
7 changed files with 206 additions and 82 deletions
--- a/pcsx2/x86/ix86/implement/xmm/arithmetic.h
+++ b/pcsx2/x86/ix86/implement/xmm/arithmetic.h
@ -18,49 +18,52 @@

 #pragma once

+//////////////////////////////////////////////////////////////////////////////////////////
+// _SimdShiftHelper -- emits one element size of a packed shift.  It's out here at
+// namespace scope (not nested in the shift classes) because C++ child class template
+// semantics are generally not cross-compiler friendly.
+template< u16 Opcode1, u16 OpcodeImm, u8 Modcode >
+class _SimdShiftHelper
+{
+public:
+	_SimdShiftHelper() {}
+
+	template< typename OperandType >
+	__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const xRegisterSIMD<OperandType>& from ) const
+	{
+		writeXMMop( 0x66, Opcode1, to, from );	// reg,reg form. NOTE(review): 0x66 is passed unconditionally, unlike the imm8 form -- presumably writeXMMop drops it for non-16-byte operands; confirm
+	}
+
+	template< typename OperandType >
+	__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const void* from ) const
+	{
+		writeXMMop( 0x66, Opcode1, to, from );	// reg,[address] form; same opcode as the reg,reg form
+	}
+
+	template< typename OperandType >
+	__noinline void operator()( const xRegisterSIMD<OperandType>& to, const ModSibBase& from ) const
+	{
+		writeXMMop( 0x66, Opcode1, to, from );	// reg,[ModSib] form; same opcode as the reg,reg form
+	}
+
+	template< typename OperandType >
+	__emitinline void operator()( const xRegisterSIMD<OperandType>& to, u8 imm8 ) const
+	{
+		SimdPrefix( (sizeof( OperandType ) == 16) ? 0x66 : 0, OpcodeImm );	// 0x66 prefix only when the operand type is 16 bytes wide
+		ModRM( 3, (int)Modcode, to.Id );	// mod=3; Modcode is passed in the middle slot, the register id last
+		xWrite<u8>( imm8 );	// trailing immediate byte
+	}
+};
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // Used for PSRA, which lacks the Q form.
 //
 template< u16 OpcodeBase1, u8 Modcode >
 class SimdImpl_ShiftWithoutQ
 {
-protected:
-	template< u16 Opcode1, u16 OpcodeImm >
-	class ShiftHelper
-	{
-	public:
-		ShiftHelper() {}
-
-		template< typename OperandType >
-		__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const xRegisterSIMD<OperandType>& from ) const
-		{
-			writeXMMop( 0x66, Opcode1, to, from );
-		}
-
-		template< typename OperandType >
-		__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const void* from ) const
-		{
-			writeXMMop( 0x66, Opcode1, to, from );
-		}
-
-		template< typename OperandType >
-		__noinline void operator()( const xRegisterSIMD<OperandType>& to, const ModSibBase& from ) const
-		{
-			writeXMMop( 0x66, Opcode1, to, from );
-		}
-
-		template< typename OperandType >
-		__emitinline void operator()( const xRegisterSIMD<OperandType>& to, u8 imm8 ) const
-		{
-			SimdPrefix( (sizeof( OperandType ) == 16) ? 0x66 : 0, OpcodeImm );
-			ModRM( 3, (int)Modcode, to.Id );
-			xWrite<u8>( imm8 );
-		}
-	};
-
 public:
-	const ShiftHelper<OpcodeBase1+1,0x71> W;
-	const ShiftHelper<OpcodeBase1+2,0x72> D;
+	const _SimdShiftHelper<OpcodeBase1+1,0x71,Modcode> W;
+	const _SimdShiftHelper<OpcodeBase1+2,0x72,Modcode> D;

 	SimdImpl_ShiftWithoutQ() {}
 };
@ -72,7 +75,7 @@ template< u16 OpcodeBase1, u8 Modcode >
 class SimdImpl_Shift : public SimdImpl_ShiftWithoutQ<OpcodeBase1, Modcode>
 {
 public:
-	const ShiftHelper<OpcodeBase1+3,0x73> Q;
+	const _SimdShiftHelper<OpcodeBase1+3,0x73,Modcode> Q;
 	
 	void DQ( const xRegisterSSE& to, u8 imm ) const
 	{
@ -228,3 +231,63 @@ public:
 	const SimdImpl_DestRegEither<0x66, 0x0a38> D;

 };
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Packed Multiply and Add!!
+//
+class SimdImpl_PMultAdd
+{
+public:
+	SimdImpl_PMultAdd() {}
+
+	// Multiplies the individual signed words of dest by the corresponding signed words
+	// of src, producing temporary signed, doubleword results. The adjacent doubleword
+	// results are then summed and stored in the destination operand.
+	//
+	//   DEST[31:0]  = ( DEST[15:0]  * SRC[15:0])  + (DEST[31:16] * SRC[31:16] );
+	//   DEST[63:32] = ( DEST[47:32] * SRC[47:32]) + (DEST[63:48] * SRC[63:48] );
+	//   [.. repeat in the case of XMM src/dest operands ..]
+	//
+	const SimdImpl_DestRegEither<0x66, 0xf5> WD;	// PMADDWD: 0F F5 /r
+
+	// [SSSE-3] multiplies vertically each unsigned byte of dest with the corresponding
+	// signed byte of src, producing intermediate signed 16-bit integers. Each adjacent
+	// pair of signed words is added and the saturated result is packed to dest.
+	// For example, the lowest-order bytes (bits 7-0) in src and dest are multiplied
+	// and the intermediate signed word result is added with the corresponding
+	// intermediate result from the 2nd lowest-order bytes (bits 15-8) of the operands;
+	// the sign-saturated result is stored in the lowest word of dest (bits 15-0).
+	// The same operation is performed on the other pairs of adjacent bytes.
+	//
+	// In Coder Speak:
+	//   DEST[15-0]  = SaturateToSignedWord( SRC[15-8]  * DEST[15-8]  + SRC[7-0]   * DEST[7-0]   );
+	//   DEST[31-16] = SaturateToSignedWord( SRC[31-24] * DEST[31-24] + SRC[23-16] * DEST[23-16] );
+	//   [.. repeat for each 16 bits up to 64 (mmx) or 128 (xmm) ..]
+	//
+	const SimdImpl_DestRegEither<0x66, 0x0438> UBSW;	// PMADDUBSW: 0F 38 04 /r (third opcode byte in the high half, 0x38 escape in the low half)
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Packed Horizontal Add [SSE3 only]
+//
+class SimdImpl_HorizAdd
+{
+public:
+	SimdImpl_HorizAdd() {}
+	
+	// [SSE-3] Horizontal Add of Packed Data.  A four step process:
+	// * Adds the single-precision floating-point values in the first and second dwords of
+	//   dest and stores the result in the first dword of dest.
+	// * Adds the values in the third and fourth dwords of dest -> second dword of dest.
+	// * Adds the values in the first and second dwords of *src* -> third dword of dest.
+	// * Adds the values in the third and fourth dwords of *src* -> fourth dword of dest.
+	//
+	const SimdImpl_DestRegSSE<0xf2, 0x7c> PS;
+	
+	// [SSE-3] Horizontal Add of Packed Data.  A two step process:
+	// * Adds the double-precision floating-point values in the high and low quadwords of
+	//   dest and stores the result in the low quadword of dest.
+	// * Adds the double-precision floating-point values in the high and low quadwords of
+	//   *src* and stores the result in the high quadword of dest.
+	const SimdImpl_DestRegSSE<0x66, 0x7c> PD;
+};
--- a/pcsx2/x86/ix86/implement/xmm/moremovs.h
+++ b/pcsx2/x86/ix86/implement/xmm/moremovs.h
@ -80,3 +80,14 @@ public:
 	const SimdImpl_DestRegSSE<AltPrefix,OpcodeSSE> SD;
 	SimdImpl_UcomI() {}
 };
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Blend - Conditional copying of values in src into dest.  [SSE4.1 only!]
+//
+class SimdImpl_Blend
+{
+public:
+	// User-provided constructor so that a const instance can be default-initialized,
+	// matching the other SimdImpl_* classes.
+	SimdImpl_Blend() {}
+
+	// [SSE-4.1] BLENDPS / BLENDPD (66 0F 3A 0C / 0D /r ib): copies the selected elements
+	// of src into dest.  The low bits of imm8 form the selection mask, one bit per element.
+	const SimdImpl_DestRegImmSSE<0x66,0x0c3a> PS;
+	const SimdImpl_DestRegImmSSE<0x66,0x0d3a> PD;
+	
+	// [SSE-4.1] BLENDVPS / BLENDVPD (66 0F 38 14 / 15 /r): variable blend.  The selection
+	// mask comes implicitly from XMM0 (yes, the fixed register) and the instruction has
+	// NO imm8 byte, so the plain dest,src implementation is used here.
+	const SimdImpl_DestRegSSE<0x66,0x1438> VPS;
+	const SimdImpl_DestRegSSE<0x66,0x1538> VPD;
+};
--- a/pcsx2/x86/ix86/implement/xmm/shufflepack.h
+++ b/pcsx2/x86/ix86/implement/xmm/shufflepack.h
@ -169,7 +169,7 @@ public:
 };

 //////////////////////////////////////////////////////////////////////////////////////////
-// PINSW/B/D [all but Word form are SSE4.1 only!]
+// PINSRW/B/D [all but Word form are SSE4.1 only!]
 //
 class SimdImpl_PInsert
 {
@ -183,19 +183,19 @@ protected:
 		__forceinline void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
 		{
 			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm );
+			xWrite<u8>( imm8 );
 		}

 		__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm8 ) const
 		{
 			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm );
+			xWrite<u8>( imm8 );
 		}

 		__noinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
 		{
 			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
-			xWrite<u8>( imm );
+			xWrite<u8>( imm8 );
 		}
 	};
 	
@ -256,13 +256,13 @@ protected:

 		__forceinline void operator()( void* dest, const xRegisterSSE& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
+			writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
 			xWrite<u8>( imm8 );
 		}

 		__noinline void operator()( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
 		{
-			writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
+			writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
 			xWrite<u8>( imm8 );
 		}
 	};
@ -304,3 +304,4 @@ public:
 	// used to extract any single packed dword value from src into an x86 32 bit register.
 	const ByteDwordForms<0x16> D;
 };
+
--- a/pcsx2/x86/ix86/ix86.cpp
+++ b/pcsx2/x86/ix86/ix86.cpp
@ -784,13 +784,20 @@ const SimdImpl_Pack xPACK;

 const SimdImpl_PAbsolute xPABS;
 const SimdImpl_PSign xPSIGN;
-const SimdImpl_PInsert xPINS;
+const SimdImpl_PInsert xPINSR;
 const SimdImpl_PExtract xPEXTR;
+const SimdImpl_PMultAdd xPMADD;
+const SimdImpl_HorizAdd xHADD;


 //////////////////////////////////////////////////////////////////////////////////////////
 //

+__emitinline void xEMMS()
+{
+	xWrite<u16>( 0x770F );	// EMMS -- bytes 0F 77 once the u16 is stored low byte first (little-endian host)
+}
+
 // Store Streaming SIMD Extension Control/Status to Mem32.
 __emitinline void xSTMXCSR( u32* dest )
 {
@ -885,5 +892,64 @@ __noinline void xMOVNTQ( const ModSibBase& to, const xRegisterMMX& from )	{ writ
 __forceinline void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from)	{ writeXMMop( 0x50, to, from ); }
 __forceinline void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from)	{ writeXMMop( 0x66, 0x50, to, from, true ); }

+//////////////////////////////////////////////////////////////////////////////////////////
+// INSERTPS / EXTRACTPS   [SSE4.1 only!]
+//
+// [TODO] these might be served better as classes, especially if other instructions use
+// the M32,sse,imm form (I forget offhand if any do).
+
+
+// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
+// location in dest, and selectively zero out the data elements in dest according to
+// the mask field in the immediate byte. The source operand can be a memory location
+// (32 bits) or an XMM register (dword chosen by COUNT_S).
+//
+// Imm8 provides three fields:
+//  * COUNT_S: The value of Imm8[7:6] selects the dword element from src.  It is treated
+//    as 0 if the source is a memory operand.
+//  * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
+//  * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
+//    with 0.0 if set to 1.
+//
+__emitinline void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x213a, to, from );	// register source form
+	xWrite<u8>( imm8 );	// trailing COUNT_S / COUNT_D / ZMASK byte
+}
+
+__emitinline void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x213a, to, from );	// direct-address source form; same opcode as the register form
+	xWrite<u8>( imm8 );	// trailing control byte, written after the operand encoding
+}
+
+__emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x213a, to, from );	// ModSib (indirect) source form; same opcode as the register form
+	xWrite<u8>( imm8 );	// trailing control byte, written after the operand encoding
+}
+
+// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
+// determined by imm8[1-0]*32. The extracted single precision floating-point value
+// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
+//
+__emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x173a, to, from, true );	// NOTE(review): the memory overloads pass the SSE register first, here it is second -- confirm the resulting ModRM reg/rm order is what EXTRACTPS expects
+	xWrite<u8>( imm8 );	// trailing element-select byte
+}
+
+__emitinline void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x173a, from, dest, true );	// register operand is passed first; 'dest' is the direct-address memory operand
+	xWrite<u8>( imm8 );	// trailing element-select byte
+}
+
+__emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 )
+{
+	writeXMMop( 0x66, 0x173a, from, dest, true );	// register operand is passed first; 'dest' is the ModSib memory operand
+	xWrite<u8>( imm8 );	// trailing element-select byte
+}
+

 }
--- a/pcsx2/x86/ix86/ix86_instructions.h
+++ b/pcsx2/x86/ix86/ix86_instructions.h
@ -384,6 +384,7 @@ namespace x86Emitter

 	// ------------------------------------------------------------------------

+	extern void xEMMS();
 	extern void xSTMXCSR( u32* dest );
 	extern void xLDMXCSR( const u32* src );

@ -429,6 +430,14 @@ namespace x86Emitter
 	extern void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from );
 	extern void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from );

+	extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
+	extern void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 );
+	extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
+
+	extern void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 );
+	extern void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 );
+	extern void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 );
+
 	// ------------------------------------------------------------------------

 	extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
@ -536,8 +545,10 @@ namespace x86Emitter
 	
 	extern const Internal::SimdImpl_PAbsolute xPABS;
 	extern const Internal::SimdImpl_PSign xPSIGN;
-	extern const Internal::SimdImpl_PInsert xPINS;
+	extern const Internal::SimdImpl_PInsert xPINSR;
 	extern const Internal::SimdImpl_PExtract xPEXTR;
+	extern const Internal::SimdImpl_PMultAdd xPMADD;
+	extern const Internal::SimdImpl_HorizAdd xHADD;

 }

--- a/pcsx2/x86/ix86/ix86_legacy_mmx.cpp
+++ b/pcsx2/x86/ix86/ix86_legacy_mmx.cpp
@ -119,19 +119,6 @@ emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from )			{ xPMUL.UDQ(
 emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8)		{ xPSHUF.W( xRegisterMMX(to), xRegisterMMX(from), imm8 ); }
 emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8)				{ xPSHUF.W( xRegisterMMX(to), (void*)from, imm8 ); }

-//////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////
+emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 ) { xPINSR.W( xRegisterMMX(to), xRegister32(from), imm8 ); }	// legacy shim; note 'from' is wrapped as a 32-bit general register despite its x86SSERegType type

-/* emms */
-emitterT void EMMS() 
-{
-	write16( 0x770F );
-}
-
-emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 )
-{
-	if (to > 7 || from > 7) Rex(1, to >> 3, 0, from >> 3);
-	write16( 0xc40f );
-	ModRM( 3, to, from );
-	write8( imm8 );
-}
+emitterT void EMMS() { xEMMS(); }	// legacy-name shim; forwards to xEMMS()
--- a/pcsx2/x86/ix86/ix86_legacy_sse.cpp
+++ b/pcsx2/x86/ix86/ix86_legacy_sse.cpp
@ -278,7 +278,10 @@ emitterT void SSSE3_PSIGNW_XMM_to_XMM(x86SSERegType to, x86SSERegType from)		{ x
 emitterT void SSSE3_PSIGND_XMM_to_XMM(x86SSERegType to, x86SSERegType from)		{ xPSIGN.D( xRegisterSSE(to), xRegisterSSE(from) ); }

 emitterT void SSE_PEXTRW_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8 )	{ xPEXTR.W( xRegister32(to), xRegisterSSE(from), imm8 ); }
-emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 )	{ xPINS.W( xRegisterSSE(to), xRegister32(from), imm8 ); }
+emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 )	{ xPINSR.W( xRegisterSSE(to), xRegister32(from), imm8 ); }	// legacy shim; destination first, then source, then imm8
+
+emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)		{ xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }	// legacy shim; forwards to the register-source xINSERTPS overload
+emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)		{ xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }	// legacy shim; forwards to the register-destination xEXTRACTPS overload

 emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }

@ -290,10 +293,10 @@ emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }
 //PEXTRW,PINSRW: Packed Extract/Insert Word                                        *
 //**********************************************************************************}

-emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)	{ SSERtoR66(0xF50F); }
+emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from)			{ xPMADD.WD( xRegisterSSE(to), xRegisterSSE(from) ); }	// legacy shim; destination ('to') is passed first, like the other wrappers

-emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from)		{ write8(0xf2); SSERtoR( 0x7c0f ); }
-emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from)				{ write8(0xf2); SSEMtoR( 0x7c0f, 0 ); }
+emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from)			{ xHADD.PS( xRegisterSSE(to), xRegisterSSE(from) ); }	// legacy shim; destination ('to') is passed first, like the other wrappers
+emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from)					{ xHADD.PS( xRegisterSSE(to), (void*)from ); }	// legacy shim; 'to' is the XMM register index, 'from' the memory address of the source


 // SSE4.1
@ -315,24 +318,6 @@ emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
 	write8(imm8);
 }

-emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x213A0F);
-	ModRM(3, to, from);
-	write8(imm8);
-}
-
-emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)
-{
-	write8(0x66);
-    RexRB(0, to, from);
-	write24(0x173A0F);
-	ModRM(3, to, from);
-	write8(imm8);
-}
-
 emitterT void SSE4_BLENDPS_XMM_to_XMM(x86IntRegType to, x86SSERegType from, u8 imm8)
 {
 	write8(0x66);