mirror of https://github.com/PCSX2/pcsx2.git
Added PMADD/HADD/INSERT/EXTRACT instructions, and fixed more cross-compiler problems.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1038 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
2dcee32079
commit
27a8f3aa9a
|
@ -19,17 +19,14 @@
|
|||
#pragma once
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Used for PSRA, which lacks the Q form.
|
||||
// ShiftHelper -- It's out here because C++ child class template semantics are generally
|
||||
// not cross-compiler friendly.
|
||||
//
|
||||
template< u16 OpcodeBase1, u8 Modcode >
|
||||
class SimdImpl_ShiftWithoutQ
|
||||
{
|
||||
protected:
|
||||
template< u16 Opcode1, u16 OpcodeImm >
|
||||
class ShiftHelper
|
||||
template< u16 Opcode1, u16 OpcodeImm, u8 Modcode >
|
||||
class _SimdShiftHelper
|
||||
{
|
||||
public:
|
||||
ShiftHelper() {}
|
||||
_SimdShiftHelper() {}
|
||||
|
||||
template< typename OperandType >
|
||||
__forceinline void operator()( const xRegisterSIMD<OperandType>& to, const xRegisterSIMD<OperandType>& from ) const
|
||||
|
@ -58,9 +55,15 @@ protected:
|
|||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Used for PSRA, which lacks the Q form.
|
||||
//
|
||||
template< u16 OpcodeBase1, u8 Modcode >
|
||||
class SimdImpl_ShiftWithoutQ
|
||||
{
|
||||
public:
|
||||
const ShiftHelper<OpcodeBase1+1,0x71> W;
|
||||
const ShiftHelper<OpcodeBase1+2,0x72> D;
|
||||
const _SimdShiftHelper<OpcodeBase1+1,0x71,Modcode> W;
|
||||
const _SimdShiftHelper<OpcodeBase1+2,0x72,Modcode> D;
|
||||
|
||||
SimdImpl_ShiftWithoutQ() {}
|
||||
};
|
||||
|
@ -72,7 +75,7 @@ template< u16 OpcodeBase1, u8 Modcode >
|
|||
class SimdImpl_Shift : public SimdImpl_ShiftWithoutQ<OpcodeBase1, Modcode>
|
||||
{
|
||||
public:
|
||||
const ShiftHelper<OpcodeBase1+3,0x73> Q;
|
||||
const _SimdShiftHelper<OpcodeBase1+3,0x73,Modcode> Q;
|
||||
|
||||
void DQ( const xRegisterSSE& to, u8 imm ) const
|
||||
{
|
||||
|
@ -228,3 +231,63 @@ public:
|
|||
const SimdImpl_DestRegEither<0x66, 0x0a38> D;
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Packed Multiply and Add!!
|
||||
//
|
||||
class SimdImpl_PMultAdd
|
||||
{
|
||||
public:
|
||||
SimdImpl_PMultAdd() {}
|
||||
|
||||
// Multiplies the individual signed words of dest by the corresponding signed words
|
||||
// of src, producing temporary signed, doubleword results. The adjacent doubleword
|
||||
// results are then summed and stored in the destination operand.
|
||||
//
|
||||
// DEST[31:0] = ( DEST[15:0] * SRC[15:0]) + (DEST[31:16] * SRC[31:16] );
|
||||
// DEST[63:32] = ( DEST[47:32] * SRC[47:32]) + (DEST[63:48] * SRC[63:48] );
|
||||
// [.. repeat in the case of XMM src/dest operands ..]
|
||||
//
|
||||
const SimdImpl_DestRegEither<0x66, 0xf5> WD;   // PMADDWD: 0F F5 /r
|
||||
|
||||
// [SSSE-3] multiplies vertically each unsigned byte of dest with the corresponding
|
||||
// signed byte of src, producing intermediate signed 16-bit integers. Each adjacent
|
||||
// pair of signed words is added and the saturated result is packed to dest.
|
||||
// For example, the lowest-order bytes (bits 7-0) in src and dest are multiplied
|
||||
// and the intermediate signed word result is added with the corresponding
|
||||
// intermediate result from the 2nd lowest-order bytes (bits 15-8) of the operands;
|
||||
// the sign-saturated result is stored in the lowest word of dest (bits 15-0).
|
||||
// The same operation is performed on the other pairs of adjacent bytes.
|
||||
//
|
||||
// In Coder Speak:
|
||||
// DEST[15-0] = SaturateToSignedWord( SRC[15-8] * DEST[15-8] + SRC[7-0] * DEST[7-0] );
|
||||
// DEST[31-16] = SaturateToSignedWord( SRC[31-24] * DEST[31-24] + SRC[23-16] * DEST[23-16] );
|
||||
// [.. repeat for each 16 bits up to 64 (mmx) or 128 (xmm) ..]
|
||||
//
|
||||
const SimdImpl_DestRegEither<0x66, 0x0438> UBSW;   // PMADDUBSW: 0F 38 04 /r, packed as (opcode << 8) | 0x38 like the other 0F 38 forms
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Packed Horizontal Add [SSE3 only]
|
||||
//
|
||||
class SimdImpl_HorizAdd
|
||||
{
|
||||
public:
|
||||
SimdImpl_HorizAdd() {}
|
||||
|
||||
// [SSE-3] Horizontal Add of Packed Data. A four step process:
|
||||
// * Adds the single-precision floating-point values in the first and second dwords of
|
||||
// dest and stores the result in the first dword of dest.
|
||||
// * Adds single-precision floating-point values in the third and fourth dword of dest
|
||||
// and stores the result in the second dword of dest.
|
||||
// * Adds single-precision floating-point values in the first and second dword of *src*
|
||||
// and stores the result in the third dword of dest.
// * Adds single-precision floating-point values in the third and fourth dword of *src*
// and stores the result in the fourth dword of dest.
|
||||
const SimdImpl_DestRegSSE<0xf2, 0x7c> PS;   // HADDPS: F2 0F 7C /r
|
||||
|
||||
// [SSE-3] Horizontal Add of Packed Data. A two step process:
|
||||
// * Adds the double-precision floating-point values in the high and low quadwords of
|
||||
// dest and stores the result in the low quadword of dest.
|
||||
// * Adds the double-precision floating-point values in the high and low quadwords of
|
||||
// *src* and stores the result in the high quadword of dest.
|
||||
const SimdImpl_DestRegSSE<0x66, 0x7c> PD;   // HADDPD: 66 0F 7C /r
|
||||
};
|
|
@ -80,3 +80,14 @@ public:
|
|||
const SimdImpl_DestRegSSE<AltPrefix,OpcodeSSE> SD;
|
||||
SimdImpl_UcomI() {}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
class SimdImpl_Blend
|
||||
{
|
||||
public:
|
||||
// User-provided constructor, matching the sibling SimdImpl_* classes, so that a
|
||||
// const global instance with const members can be declared.
|
||||
SimdImpl_Blend() {}
|
||||
|
||||
// [SSE-4.1] BLENDPS / BLENDPD (66 0F 3A 0C / 0D, followed by imm8): each bit of the
|
||||
// immediate selects whether the corresponding element of dest is replaced by src.
|
||||
const SimdImpl_DestRegImmSSE<0x66,0x0c3a> PS;
|
||||
const SimdImpl_DestRegImmSSE<0x66,0x0d3a> PD;
|
||||
|
||||
// [SSE-4.1] BLENDVPS / BLENDVPD (66 0F 38 14 / 15): the selection mask is taken from
|
||||
// the implicit XMM0 register and the encoding has no immediate byte, so these use
|
||||
// the plain dest/src helper.
|
||||
// NOTE(review): assumes SimdImpl_DestRegImmSSE appends an imm8 -- confirm against its definition.
|
||||
const SimdImpl_DestRegSSE<0x66,0x1438> VPS;
|
||||
const SimdImpl_DestRegSSE<0x66,0x1538> VPD;
|
||||
};
|
|
@ -169,7 +169,7 @@ public:
|
|||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// PINSW/B/D [all but Word form are SSE4.1 only!]
|
||||
// PINSRW/B/D [all but Word form are SSE4.1 only!]
|
||||
//
|
||||
class SimdImpl_PInsert
|
||||
{
|
||||
|
@ -183,19 +183,19 @@ protected:
|
|||
__forceinline void operator()( const xRegisterSSE& to, const xRegister32& from, u8 imm8 ) const
|
||||
{
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
|
||||
xWrite<u8>( imm );
|
||||
xWrite<u8>( imm8 );
|
||||
}
|
||||
|
||||
__forceinline void operator()( const xRegisterSSE& to, const void* from, u8 imm8 ) const
|
||||
{
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
|
||||
xWrite<u8>( imm );
|
||||
xWrite<u8>( imm8 );
|
||||
}
|
||||
|
||||
__noinline void operator()( const xRegisterSSE& to, const ModSibBase& from, u8 imm8 ) const
|
||||
{
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
|
||||
xWrite<u8>( imm );
|
||||
xWrite<u8>( imm8 );
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -256,13 +256,13 @@ protected:
|
|||
|
||||
__forceinline void operator()( void* dest, const xRegisterSSE& from, u8 imm8 ) const
|
||||
{
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
|
||||
xWrite<u8>( imm8 );
|
||||
}
|
||||
|
||||
__noinline void operator()( const ModSibBase& dest, const xRegisterSSE& from, u8 imm8 ) const
|
||||
{
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, to, from );
|
||||
writeXMMop( 0x66, (Opcode<<8) | 0x3a, from, dest );
|
||||
xWrite<u8>( imm8 );
|
||||
}
|
||||
};
|
||||
|
@ -304,3 +304,4 @@ public:
|
|||
// used to extract any single packed dword value from src into an x86 32 bit register.
|
||||
const ByteDwordForms<0x16> D;
|
||||
};
|
||||
|
||||
|
|
|
@ -784,13 +784,20 @@ const SimdImpl_Pack xPACK;
|
|||
|
||||
const SimdImpl_PAbsolute xPABS;
|
||||
const SimdImpl_PSign xPSIGN;
|
||||
const SimdImpl_PInsert xPINS;
|
||||
const SimdImpl_PInsert xPINSR;
|
||||
const SimdImpl_PExtract xPEXTR;
|
||||
const SimdImpl_PMultAdd xPMADD;
|
||||
const SimdImpl_HorizAdd xHADD;
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
|
||||
// Emits the EMMS instruction (opcode bytes 0F 77).
__emitinline void xEMMS()
|
||||
{
|
||||
xWrite<u16>( 0x770F );   // written as one 16-bit value; low byte 0x0F comes first on a little-endian host
|
||||
}
|
||||
|
||||
// Store Streaming SIMD Extension Control/Status to Mem32.
|
||||
__emitinline void xSTMXCSR( u32* dest )
|
||||
{
|
||||
|
@ -885,5 +892,64 @@ __noinline void xMOVNTQ( const ModSibBase& to, const xRegisterMMX& from ) { writ
|
|||
__forceinline void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from) { writeXMMop( 0x50, to, from ); }
|
||||
__forceinline void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from) { writeXMMop( 0x66, 0x50, to, from, true ); }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// INSERTPS / EXTRACTPS [SSE4.1 only!]
|
||||
//
|
||||
// [TODO] these might be served better as classes, especially if other instructions use
|
||||
// the M32,sse,imm form (I forget offhand if any do).
|
||||
|
||||
|
||||
// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
|
||||
// location in dest, and selectively zero out the data elements in dest according to
|
||||
// the mask field in the immediate byte. The source operand can be a memory location
|
||||
// (32 bits) or an XMM register (lower 32 bits used).
|
||||
//
|
||||
// Imm8 provides three fields:
|
||||
// * COUNT_S: The value of Imm8[7:6] selects the dword element from src. It is 0 if
|
||||
// the source is a memory operand.
|
||||
// * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
|
||||
// * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
|
||||
// with 0.0 if set to 1.
|
||||
//
|
||||
// INSERTPS, register source form: `to` is the destination XMM register, `from` the source XMM register.
__emitinline void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 )
|
||||
{
|
||||
writeXMMop( 0x66, 0x213a, to, from );   // prefix 0x66, opcode word 0x213a (INSERTPS is 66 0F 3A 21 /r ib)
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
// INSERTPS, direct memory source form: `from` is the address of the 32-bit source value.
__emitinline void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 )
|
||||
{
|
||||
writeXMMop( 0x66, 0x213a, to, from );   // same prefix/opcode word as the register form
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
// INSERTPS, indirect memory source form: `from` is a 32-bit ModSib memory operand.
__emitinline void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 )
|
||||
{
|
||||
writeXMMop( 0x66, 0x213a, to, from );   // same prefix/opcode word as the register form
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
|
||||
// determined by imm8[1-0]*32. The extracted single precision floating-point value
|
||||
// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
|
||||
//
|
||||
// EXTRACTPS, register destination form: `to` is the 32-bit general register, `from` the source XMM register.
__emitinline void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 )
|
||||
{
|
||||
// Prefix 0x66, opcode word 0x173a (EXTRACTPS is 66 0F 3A 17 /r ib). The meaning of the
// trailing `true` is defined by writeXMMop, which is not visible here.
// NOTE(review): the memory overloads pass the XMM source first (from, dest) while this
// one passes the destination first (to, from). EXTRACTPS encodes the XMM source in
// ModRM.reg and the destination in ModRM.r/m -- confirm the operand order is intended.
writeXMMop( 0x66, 0x173a, to, from, true );
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
// EXTRACTPS, direct memory destination form: `dest` is the address that receives the 32-bit value.
__emitinline void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 )
|
||||
{
|
||||
writeXMMop( 0x66, 0x173a, from, dest, true );   // XMM source is passed first, memory destination second
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
// EXTRACTPS, indirect memory destination form: `dest` is a 32-bit ModSib memory operand.
__emitinline void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 )
|
||||
{
|
||||
writeXMMop( 0x66, 0x173a, from, dest, true );   // XMM source is passed first, memory destination second
|
||||
xWrite<u8>( imm8 );   // immediate byte is written after the instruction
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -384,6 +384,7 @@ namespace x86Emitter
|
|||
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
extern void xEMMS();
|
||||
extern void xSTMXCSR( u32* dest );
|
||||
extern void xLDMXCSR( const u32* src );
|
||||
|
||||
|
@ -429,6 +430,14 @@ namespace x86Emitter
|
|||
extern void xMOVMSKPS( const xRegister32& to, xRegisterSSE& from );
|
||||
extern void xMOVMSKPD( const xRegister32& to, xRegisterSSE& from );
|
||||
|
||||
extern void xINSERTPS( const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8 );
|
||||
extern void xINSERTPS( const xRegisterSSE& to, const u32* from, u8 imm8 );
|
||||
extern void xINSERTPS( const xRegisterSSE& to, const ModSibStrict<u32>& from, u8 imm8 );
|
||||
|
||||
extern void xEXTRACTPS( const xRegister32& to, const xRegisterSSE& from, u8 imm8 );
|
||||
extern void xEXTRACTPS( u32* dest, const xRegisterSSE& from, u8 imm8 );
|
||||
extern void xEXTRACTPS( const ModSibStrict<u32>& dest, const xRegisterSSE& from, u8 imm8 );
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
extern const Internal::SimdImpl_DestRegSSE<0xf3,0x12> xMOVSLDUP;
|
||||
|
@ -536,8 +545,10 @@ namespace x86Emitter
|
|||
|
||||
extern const Internal::SimdImpl_PAbsolute xPABS;
|
||||
extern const Internal::SimdImpl_PSign xPSIGN;
|
||||
extern const Internal::SimdImpl_PInsert xPINS;
|
||||
extern const Internal::SimdImpl_PInsert xPINSR;
|
||||
extern const Internal::SimdImpl_PExtract xPEXTR;
|
||||
extern const Internal::SimdImpl_PMultAdd xPMADD;
|
||||
extern const Internal::SimdImpl_HorizAdd xHADD;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -119,19 +119,6 @@ emitterT void PMULUDQRtoR( x86MMXRegType to, x86MMXRegType from ) { xPMUL.UDQ(
|
|||
emitterT void PSHUFWRtoR(x86MMXRegType to, x86MMXRegType from, u8 imm8) { xPSHUF.W( xRegisterMMX(to), xRegisterMMX(from), imm8 ); }
|
||||
emitterT void PSHUFWMtoR(x86MMXRegType to, uptr from, u8 imm8) { xPSHUF.W( xRegisterMMX(to), (void*)from, imm8 ); }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 ) { xPINSR.W( xRegisterMMX(to), xRegister32(from), imm8 ); }
|
||||
|
||||
/* emms */
|
||||
emitterT void EMMS()
|
||||
{
|
||||
write16( 0x770F );
|
||||
}
|
||||
|
||||
emitterT void PINSRWRtoMMX( x86MMXRegType to, x86SSERegType from, u8 imm8 )
|
||||
{
|
||||
if (to > 7 || from > 7) Rex(1, to >> 3, 0, from >> 3);
|
||||
write16( 0xc40f );
|
||||
ModRM( 3, to, from );
|
||||
write8( imm8 );
|
||||
}
|
||||
emitterT void EMMS() { xEMMS(); }
|
||||
|
|
|
@ -278,7 +278,10 @@ emitterT void SSSE3_PSIGNW_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { x
|
|||
emitterT void SSSE3_PSIGND_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xPSIGN.D( xRegisterSSE(to), xRegisterSSE(from) ); }
|
||||
|
||||
emitterT void SSE_PEXTRW_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8 ) { xPEXTR.W( xRegister32(to), xRegisterSSE(from), imm8 ); }
|
||||
emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 ) { xPINS.W( xRegisterSSE(to), xRegister32(from), imm8 ); }
|
||||
emitterT void SSE_PINSRW_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 imm8 ) { xPINSR.W( xRegisterSSE(to), xRegister32(from), imm8 ); }
|
||||
|
||||
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); }
|
||||
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8) { xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); }
|
||||
|
||||
emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }
|
||||
|
||||
|
@ -290,10 +293,10 @@ emitterT void SSE_LDMXCSR( uptr from ) { xLDMXCSR( (u32*)from ); }
|
|||
//PEXTRW,PINSRW: Packed Extract/Insert Word *
|
||||
//**********************************************************************************}
|
||||
|
||||
emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { SSERtoR66(0xF50F); }
|
||||
// Legacy wrapper for PMADDWD xmm, xmm. The destination register (`to`) is passed first, as in the other xRegisterSSE wrappers.
emitterT void SSE2_PMADDWD_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xPMADD.WD( xRegisterSSE(to), xRegisterSSE(from) ); }
|
||||
|
||||
emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { write8(0xf2); SSERtoR( 0x7c0f ); }
|
||||
emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from) { write8(0xf2); SSEMtoR( 0x7c0f, 0 ); }
|
||||
// Legacy wrapper for HADDPS xmm, xmm. The destination register (`to`) is passed first, as in the other xRegisterSSE wrappers.
emitterT void SSE3_HADDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from) { xHADD.PS( xRegisterSSE(to), xRegisterSSE(from) ); }
|
||||
// Legacy wrapper for HADDPS xmm, m128. `to` is the destination register index and `from` is the source memory address.
emitterT void SSE3_HADDPS_M128_to_XMM(x86SSERegType to, uptr from) { xHADD.PS( xRegisterSSE(to), (void*)from ); }
|
||||
|
||||
|
||||
// SSE4.1
|
||||
|
@ -315,24 +318,6 @@ emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8)
|
|||
write8(imm8);
|
||||
}
|
||||
|
||||
emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8)
|
||||
{
|
||||
write8(0x66);
|
||||
RexRB(0, to, from);
|
||||
write24(0x213A0F);
|
||||
ModRM(3, to, from);
|
||||
write8(imm8);
|
||||
}
|
||||
|
||||
emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8)
|
||||
{
|
||||
write8(0x66);
|
||||
RexRB(0, to, from);
|
||||
write24(0x173A0F);
|
||||
ModRM(3, to, from);
|
||||
write8(imm8);
|
||||
}
|
||||
|
||||
emitterT void SSE4_BLENDPS_XMM_to_XMM(x86IntRegType to, x86SSERegType from, u8 imm8)
|
||||
{
|
||||
write8(0x66);
|
||||
|
|
Loading…
Reference in New Issue