Added PMADD/HADD/INSERT/EXTRACT instructions, and fixed more cross-compiler problems.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1038 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-04-21 21:30:47 +00:00
parent 2dcee32079
commit 27a8f3aa9a
7 changed files with 206 additions and 82 deletions

View File

@ -19,17 +19,14 @@
#pragma once
//////////////////////////////////////////////////////////////////////////////////////////
// Used for PSRA, which lacks the Q form.
// _SimdShiftHelper -- Declared at namespace scope (rather than nested inside the Shift
// classes) because nested class template semantics are generally not cross-compiler friendly.
//
template< u16 OpcodeBase1, u8 Modcode >
class SimdImpl_ShiftWithoutQ
{
protected:
template< u16 Opcode1, u16 OpcodeImm >
class ShiftHelper
template< u16 Opcode1, u16 OpcodeImm, u8 Modcode >
class _SimdShiftHelper
{
public:
ShiftHelper() {}
_SimdShiftHelper() {}
template< typename OperandType >
__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const xRegisterSIMD<OperandType>& from ) const
@ -58,9 +55,15 @@ protected:
}
};
//////////////////////////////////////////////////////////////////////////////////////////
// Used for PSRA, which lacks the Q form.
//
template< u16 OpcodeBase1, u8 Modcode >
class SimdImpl_ShiftWithoutQ
{
public:
const ShiftHelper<OpcodeBase1+1,0x71> W;
const ShiftHelper<OpcodeBase1+2,0x72> D;
const _SimdShiftHelper<OpcodeBase1+1,0x71,Modcode> W;
const _SimdShiftHelper<OpcodeBase1+2,0x72,Modcode> D;
SimdImpl_ShiftWithoutQ() {}
};
@ -72,7 +75,7 @@ template< u16 OpcodeBase1, u8 Modcode >
class SimdImpl_Shift : public SimdImpl_ShiftWithoutQ<OpcodeBase1, Modcode>
{
public:
const ShiftHelper<OpcodeBase1+3,0x73> Q;
const _SimdShiftHelper<OpcodeBase1+3,0x73,Modcode> Q;
void DQ( const xRegisterSSE& to, u8 imm ) const
{
@ -228,3 +231,63 @@ public:
const SimdImpl_DestRegEither<0x66, 0x0a38> D;
};
//////////////////////////////////////////////////////////////////////////////////////////
// Packed Multiply and Add!!
//
class SimdImpl_PMultAdd
{
public:
	SimdImpl_PMultAdd() {}

	// PMADDWD (opcode 0F F5):
	// Multiplies the individual signed words of dest by the corresponding signed words
	// of src, producing temporary signed, doubleword results.  The adjacent doubleword
	// results are then summed and stored in the destination operand.
	//
	//   DEST[31:0]  = ( DEST[15:0]  * SRC[15:0])  + (DEST[31:16] * SRC[31:16] );
	//   DEST[63:32] = ( DEST[47:32] * SRC[47:32]) + (DEST[63:48] * SRC[63:48] );
	//   [.. repeat in the case of XMM src/dest operands ..]
	//
	const SimdImpl_DestRegEither<0x66, 0xf5> WD;

	// [SSSE-3] PMADDUBSW (opcode 0F 38 04):
	// Multiplies vertically each unsigned byte of dest with the corresponding signed
	// byte of src, producing intermediate signed 16-bit integers.  Each adjacent pair of
	// signed words is added and the saturated result is packed to dest.
	// For example, the lowest-order bytes (bits 7-0) in src and dest are multiplied
	// and the intermediate signed word result is added with the corresponding
	// intermediate result from the 2nd lowest-order bytes (bits 15-8) of the operands;
	// the sign-saturated result is stored in the lowest word of dest (bits 15-0).
	// The same operation is performed on the other pairs of adjacent bytes.
	//
	// In Coder Speak:
	//   DEST[15-0]  = SaturateToSignedWord( SRC[15-8]  * DEST[15-8]  + SRC[7-0]   * DEST[7-0]   );
	//   DEST[31-16] = SaturateToSignedWord( SRC[31-24] * DEST[31-24] + SRC[23-16] * DEST[23-16] );
	//   [.. repeat for each 16 bits up to 64 (mmx) or 128 (xmm) ..]
	//
	// The opcode constant packs the instruction byte above the 0x38 escape byte, i.e.
	// (0x04 << 8) | 0x38.  The previous value 0xf438 selected opcode byte 0xF4, which is
	// not PMADDUBSW.
	//
	const SimdImpl_DestRegEither<0x66, 0x0438> UBSW;
};
//////////////////////////////////////////////////////////////////////////////////////////
// Packed Horizontal Add [SSE3 only]
//
class SimdImpl_HorizAdd
{
public:
SimdImpl_HorizAdd() {}
// [SSE-3] HADDPS -- Horizontal Add of Packed Single-Precision data. A four step process:
// * Adds the single-precision floating-point values in the first and second dwords of
// dest and stores the result in the first dword of dest.
// * Adds the single-precision floating-point values in the third and fourth dwords of
// dest and stores the result in the second dword of dest.
// * Adds the single-precision floating-point values in the first and second dwords of
// *src* and stores the result in the third dword of dest.
// * Adds the single-precision floating-point values in the third and fourth dwords of
// *src* and stores the result in the fourth dword of dest.
const SimdImpl_DestRegSSE<0xf2, 0x7c> PS;
// [SSE-3] HADDPD -- Horizontal Add of Packed Double-Precision data. A two step process:
// * Adds the double-precision floating-point values in the high and low quadwords of
// dest and stores the result in the low quadword of dest.
// * Adds the double-precision floating-point values in the high and low quadwords of
// *src* and stores the result in the high quadword of dest.
const SimdImpl_DestRegSSE<0x66, 0x7c> PD;
};

View File

@ -80,3 +80,14 @@ public:
const SimdImpl_DestRegSSE<AltPrefix,OpcodeSSE> SD;
SimdImpl_UcomI() {}
};
//////////////////////////////////////////////////////////////////////////////////////////
//
class SimdImpl_Blend
{
SimdImpl_DestRegImmSSE<0x66,0x0c3a> PS;
SimdImpl_DestRegImmSSE<0x66,0x0d3a> PD;
SimdImpl_DestRegImmSSE<0x66,0x1438> VPS;
SimdImpl_DestRegImmSSE<0x66,0x1538> VPD;
};

View File

@ -169,7 +169,7 @@ public:
};
//////////////////////////////////////////////////////////////////////////////////////////
// PINSW/B/D [all but Word form are SSE4.1 only!]
// PINSRW/B/D [all but Word form are SSE4.1 only!]
//
class SimdImpl_PInsert
{
@ -183,19 +183,19 @@ protected:
__forceinline void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
{
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
xWrite<u8>( imm );
xWrite<u8>( imm8 );
}
__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm8 ) const
{
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
xWrite<u8>( imm );
xWrite<u8>( imm8 );
}
__noinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
{
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
xWrite<u8>( imm );
xWrite<u8>( imm8 );
}
};
@ -256,13 +256,13 @@ protected:
__forceinline void operator()( void* dest, const xRegisterSSE& from, u8 imm8 ) const
{
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
xWrite<u8>( imm8 );
}
__noinline void operator()( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
{
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
xWrite<u8>( imm8 );
}
};
@ -304,3 +304,4 @@ public:
// used to extract any single packed dword value from src into an x86 32 bit register.
const ByteDwordForms<0x16> D;
};

View File

@ -784,13 +784,20 @@ const SimdImpl_Pack xPACK;
const SimdImpl_PAbsolute xPABS;
const SimdImpl_PSign xPSIGN;
const SimdImpl_PInsert xPINS;
const SimdImpl_PInsert xPINSR;
const SimdImpl_PExtract xPEXTR;
const SimdImpl_PMultAdd xPMADD;
const SimdImpl_HorizAdd xHADD;
//////////////////////////////////////////////////////////////////////////////////////////
//
// Emits the EMMS instruction as a single 16-bit write.
__emitinline void xEMMS()
{
	// Two-byte EMMS encoding, written as one u16 value.
	const u16 emmsOpcode = 0x770F;
	xWrite<u16>( emmsOpcode );
}
// Store Streaming SIMD Extension Control/Status to Mem32.
__emitinline void xSTMXCSR( u32* dest )
{
@ -885,5 +892,64 @@ __noinline void xMOVNTQ( const ModSibBase& to, const xRegisterMMX& from ) { writ
__forceinline void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from) { writeXMMop( 0x50, to, from ); }
__forceinline void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from) { writeXMMop( 0x66, 0x50, to, from, true ); }
//////////////////////////////////////////////////////////////////////////////////////////
// INSERTPS / EXTRACTPS [SSE4.1 only!]
//
// [TODO] these might be served better as classes, especially if other instructions use
// the M32,sse,imm form (I forget offhand if any do).
// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
// location in dest, and selectively zero out the data elements in dest according to
// the mask field in the immediate byte. The source operand can be a memory location
// (32 bits) or an XMM register (lower 32 bits used).
//
// Imm8 provides three fields:
// * COUNT_S: The value of Imm8[7:6] selects the dword element from src. It is 0 if
// the source is a memory operand.
// * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
// * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
// with 0.0 if set to 1.
//
// INSERTPS, XMM register source: emits the prefixed opcode for (to, from), then the
// imm8 control byte.
__emitinline void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )
{
	const u8  prefix = 0x66;
	const u16 opcode = 0x213a;

	writeXMMop( prefix, opcode, to, from );
	xWrite<u8>( imm8 );
}
// INSERTPS, direct 32-bit memory source: emits the prefixed opcode for (to, from), then
// the imm8 control byte.
__emitinline void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 )
{
	const u8  prefix = 0x66;
	const u16 opcode = 0x213a;

	writeXMMop( prefix, opcode, to, from );
	xWrite<u8>( imm8 );
}
// INSERTPS, indirect (ModSib) 32-bit memory source: emits the prefixed opcode for
// (to, from), then the imm8 control byte.
__emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 )
{
	const u8  prefix = 0x66;
	const u16 opcode = 0x213a;

	writeXMMop( prefix, opcode, to, from );
	xWrite<u8>( imm8 );
}
// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
// determined by imm8[1-0]*32. The extracted single precision floating-point value
// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
//
// EXTRACTPS, 32-bit general purpose register destination.
//
// EXTRACTPS is encoded as "66 0F 3A 17 /r ib" with the XMM *source* in the ModRM reg
// field and the destination (register or memory) in the r/m field.  The XMM register is
// therefore passed first, exactly as the two memory-destination overloads do.  Passing
// (to, from) here would select the wrong registers.
__emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )
{
	writeXMMop( 0x66, 0x173a, from, to, true );
	xWrite<u8>( imm8 );
}
// EXTRACTPS, direct 32-bit memory destination: emits the prefixed opcode with the XMM
// source passed ahead of the memory operand, then the imm8 selector byte.
__emitinline void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 )
{
	const u8  prefix = 0x66;
	const u16 opcode = 0x173a;

	writeXMMop( prefix, opcode, from, dest, true );
	xWrite<u8>( imm8 );
}
// EXTRACTPS, indirect (ModSib) 32-bit memory destination: emits the prefixed opcode with
// the XMM source passed ahead of the memory operand, then the imm8 selector byte.
__emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 )
{
	const u8  prefix = 0x66;
	const u16 opcode = 0x173a;

	writeXMMop( prefix, opcode, from, dest, true );
	xWrite<u8>( imm8 );
}
}

View File

@ -384,6 +384,7 @@ namespace x86Emitter
// ------------------------------------------------------------------------
extern void xEMMS();
extern void xSTMXCSR( u32* dest );
extern void xLDMXCSR( const u32* src );
@ -429,6 +430,14 @@ namespace x86Emitter
extern void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from );
extern void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from );
extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
extern void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 );
extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
extern void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 );
extern void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 );
extern void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 );
// ------------------------------------------------------------------------
extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
@ -536,8 +545,10 @@ namespace x86Emitter
extern const Internal::SimdImpl_PAbsolute xPABS;
extern const Internal::SimdImpl_PSign xPSIGN;
extern const Internal::SimdImpl_PInsert xPINS;
extern const Internal::SimdImpl_PInsert xPINSR;
extern const Internal::SimdImpl_PExtract xPEXTR;
extern const Internal::SimdImpl_PMultAdd xPMADD;
extern const Internal::SimdImpl_HorizAdd xHADD;
}

View File

@ -119,19 +119,6 @@ emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from ) { xPMUL.UDQ(
emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8) { xPSHUF.W( xRegisterMMX(to), xRegisterMMX(from), imm8 ); }
emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8) { xPSHUF.W( xRegisterMMX(to), (void*)from, imm8 ); }
//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 ) { xPINSR.W( xRegisterMMX(to), xRegister32(from), imm8 ); }
/* emms */
emitterT void EMMS()
{
write16( 0x770F );
}
emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 )
{
if (to > 7 || from > 7) Rex(1, to >> 3, 0, from >> 3);
write16( 0xc40f );
ModRM( 3, to, from );
write8( imm8 );
}
emitterT void EMMS() { xEMMS(); }

View File

@ -278,7 +278,10 @@ emitterT void SSSE3_PSIGNW_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { x
emitterT void SSSE3_PSIGND_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xPSIGN.D( xRegisterSSE(to), xRegisterSSE(from) ); }
emitterT void SSE_PEXTRW_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8 ) { xPEXTR.W( xRegister32(to), xRegisterSSE(from), imm8 ); }
emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 ) { xPINS.W( xRegisterSSE(to), xRegister32(from), imm8 ); }
emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 ) { xPINSR.W( xRegisterSSE(to), xRegister32(from), imm8 ); }
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8) { xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }
emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }
@ -290,10 +293,10 @@ emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }
//PEXTRW,PINSRW: Packed Extract/Insert Word *
//**********************************************************************************}
emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { SSERtoR66(0xF50F); }
// Legacy wrapper for PMADDWD xmm,xmm.  The new emitter takes (dest, src), so 'to' must be
// passed first; the operands were previously swapped.
emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xPMADD.WD( xRegisterSSE(to), xRegisterSSE(from) ); }
emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { write8(0xf2); SSERtoR( 0x7c0f ); }
emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf2); SSEMtoR( 0x7c0f, 0 ); }
// Legacy wrapper for HADDPS xmm,xmm.  The new emitter takes (dest, src), so 'to' must be
// passed first; the operands were previously swapped.
emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xHADD.PS( xRegisterSSE(to), xRegisterSSE(from) ); }
// Legacy wrapper for HADDPS xmm,m128.  'to' is the destination register id and 'from' is
// the memory address.  The previous version wrapped the address as a register id and
// cast the register id to a pointer.
emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from) { xHADD.PS( xRegisterSSE(to), (void*)from ); }
// SSE4.1
@ -315,24 +318,6 @@ emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
write8(imm8);
}
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x213A0F);
ModRM(3, to, from);
write8(imm8);
}
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);
RexRB(0, to, from);
write24(0x173A0F);
ModRM(3, to, from);
write8(imm8);
}
emitterT void SSE4_BLENDPS_XMM_to_XMM(x86IntRegType to, x86SSERegType from, u8 imm8)
{
write8(0x66);