Added tmkk's PADDSW/PSUBSW opcodes, which replace the slower interpreter ones we had been using until now. Great job on those, tmkk!
git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@613 a6443dda-0b58-4228-96e9-037be469359c
parent d0a7068f2e
commit cf156d3c27
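Note (illustrative, not part of the diff): PADDSW and PSUBSW are the EE's parallel signed-saturating add/subtract on the four 32-bit words of a 128-bit GPR, which is the behaviour the recompiled blocks below reproduce. A minimal C sketch of that per-word semantics, using the same clamp values (0x7FFFFFFF / 0x80000000) that appear in the comments of the new code; the helper names are made up for illustration:

    #include <stdint.h>

    /* Clamp a 64-bit intermediate result into the signed 32-bit range. */
    static int32_t saturate_s32(int64_t v)
    {
        if (v > INT32_MAX) return INT32_MAX;   /* 0x7FFFFFFF */
        if (v < INT32_MIN) return INT32_MIN;   /* 0x80000000 */
        return (int32_t)v;
    }

    /* PADDSW reference: Rd[i] = saturate(Rs[i] + Rt[i]) for each word. */
    static void paddsw_ref(int32_t rd[4], const int32_t rs[4], const int32_t rt[4])
    {
        for (int i = 0; i < 4; i++)
            rd[i] = saturate_s32((int64_t)rs[i] + rt[i]);
    }

    /* PSUBSW reference: Rd[i] = saturate(Rs[i] - Rt[i]) for each word. */
    static void psubsw_ref(int32_t rd[4], const int32_t rs[4], const int32_t rt[4])
    {
        for (int i = 0; i < 4; i++)
            rd[i] = saturate_s32((int64_t)rs[i] - rt[i]);
    }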
@@ -937,8 +937,104 @@ void recPADDSW( void )
{
    if ( ! _Rd_ ) return;

    //CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
    //CPU_SSE_XMMCACHE_END
    CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
    int t2reg = _allocTempXMMreg(XMMT_INT, -1);
    int t3reg = _allocTempXMMreg(XMMT_INT, -1);

    if ( cpucaps.hasStreamingSIMD4Extensions ) {
        SSE4_PMOVSXDQ_XMM_to_XMM(t0reg, EEREC_S);
        SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, EEREC_T);
        SSE2_PADDQ_XMM_to_XMM(t0reg, t1reg);
        SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x0e);
        SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x0e);
        SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, t1reg);
        SSE4_PMOVSXDQ_XMM_to_XMM(t2reg, t2reg);
    }
    else {
        SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_S);
        SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_T);
        SSE2_PXOR_XMM_to_XMM(t2reg, t2reg);
        SSE2_PXOR_XMM_to_XMM(t3reg, t3reg);
        SSE2_PCMPGTD_XMM_to_XMM(t2reg, t0reg);
        SSE2_PCMPGTD_XMM_to_XMM(t3reg, t1reg);
        SSE2_PUNPCKLDQ_XMM_to_XMM(t0reg, t2reg);
        SSE2_PUNPCKLDQ_XMM_to_XMM(t1reg, t3reg);
        SSE2_PADDQ_XMM_to_XMM(t0reg, t1reg);
        SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_S);
        SSE2_PUNPCKHDQ_XMM_to_XMM(t1reg, t2reg);
        SSEX_MOVDQA_XMM_to_XMM(t2reg, EEREC_T);
        SSE2_PUNPCKHDQ_XMM_to_XMM(t2reg, t3reg);
    }
    SSE2_PADDQ_XMM_to_XMM(t1reg, t2reg);
    /*
        t0reg = { Rs[0]+Rt[0], Rs[1]+Rt[1] }
        t1reg = { Rs[2]+Rt[2], Rs[3]+Rt[3] }
    */

    SSEX_MOVDQA_XMM_to_XMM(t2reg, t0reg);
    SSE_SHUFPS_XMM_to_XMM(t2reg, t1reg, 0xdd);
    SSE2_PSRAD_I8_to_XMM(t2reg, 31);
    /*
        t2reg = { (Rs[0]+Rt[0]) < 0 ? 0xFFFFFFFF : 0,
                  (Rs[1]+Rt[1]) < 0 ? 0xFFFFFFFF : 0,
                  (Rs[2]+Rt[2]) < 0 ? 0xFFFFFFFF : 0,
                  (Rs[3]+Rt[3]) < 0 ? 0xFFFFFFFF : 0 }
    */

    SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0x50);
    SSE2_PXOR_XMM_to_XMM(t0reg, t3reg);
    SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
    SSE2_PADDQ_XMM_to_XMM(t0reg, t3reg);
    /*
        t0reg = { abs(Rs[0]+Rt[0]), abs(Rs[1]+Rt[1]) }
    */
    SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0xfa);
    SSE2_PXOR_XMM_to_XMM(t1reg, t3reg);
    SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
    SSE2_PADDQ_XMM_to_XMM(t1reg, t3reg);
    /*
        t1reg = { abs(Rs[2]+Rt[2]), abs(Rs[3]+Rt[3]) }
    */
    SSE2_PSLLQ_I8_to_XMM(t0reg, 1);
    SSE2_PSLLQ_I8_to_XMM(t1reg, 1);
    SSE2_PCMPEQB_XMM_to_XMM(t3reg, t3reg);
    SSE2_PSRLD_I8_to_XMM(t3reg, 1);
    SSE2_PXOR_XMM_to_XMM(t2reg, t3reg);
    SSE_SHUFPS_XMM_to_XMM(t0reg, t1reg, 0xdd);
    SSE2_PXOR_XMM_to_XMM(t1reg, t1reg);
    SSE2_PCMPEQD_XMM_to_XMM(t1reg, t0reg);
    /*
        t1reg = { abs(Rs[0]+Rt[0]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
                  abs(Rs[1]+Rt[1]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
                  abs(Rs[2]+Rt[2]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
                  abs(Rs[3]+Rt[3]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF }
        t2reg = { (Rs[0]+Rt[0]) < 0 ? 0x80000000 : 0x7FFFFFFF,
                  (Rs[1]+Rt[1]) < 0 ? 0x80000000 : 0x7FFFFFFF,
                  (Rs[2]+Rt[2]) < 0 ? 0x80000000 : 0x7FFFFFFF,
                  (Rs[3]+Rt[3]) < 0 ? 0x80000000 : 0x7FFFFFFF }
    */
    if( EEREC_D == EEREC_S ) SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_T);
    else if( EEREC_D == EEREC_T ) SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_S);
    else {
        SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
        SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_T);
    }
    SSE2_PAND_XMM_to_XMM(EEREC_D, t1reg);
    SSE2_PANDN_XMM_to_XMM(t1reg, t2reg);
    SSE2_POR_XMM_to_XMM(EEREC_D, t1reg);
    /*
        Rd = { t1reg[0] ? Rs[0]+Rt[0] : t2reg[0],
               t1reg[1] ? Rs[1]+Rt[1] : t2reg[1],
               t1reg[2] ? Rs[2]+Rt[2] : t2reg[2],
               t1reg[3] ? Rs[3]+Rt[3] : t2reg[3] }
    */
    _freeXMMreg(t0reg);
    _freeXMMreg(t1reg);
    _freeXMMreg(t2reg);
    _freeXMMreg(t3reg);
    CPU_SSE_XMMCACHE_END

    if( _Rd_ ) _deleteEEreg(_Rd_, 0);
    _deleteEEreg(_Rs_, 1);
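Note (illustrative, not part of the diff): the PAND/PANDN/POR sequence at the end of recPADDSW is a branchless select between the plain 32-bit sum and the clamp value, driven by the masks described in the comments above (t1reg = in-range mask, t2reg = per-word clamp). A scalar sketch of that selection step, with made-up names:

    #include <stdint.h>

    /* keep_mask is 0xFFFFFFFF where the widened sum fit into int32 (t1reg);
       clamp is 0x80000000 or 0x7FFFFFFF depending on the sum's sign (t2reg). */
    static uint32_t select_saturated(uint32_t sum_lo32, uint32_t keep_mask, uint32_t clamp)
    {
        return (sum_lo32 & keep_mask)   /* PAND: keep the sum where it fit          */
             | (~keep_mask & clamp);    /* PANDN + POR: use the clamp where it didn't */
    }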
@@ -1028,8 +1124,76 @@ void recPSUBSW( void )
{
    if ( ! _Rd_ ) return;

    //CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
    //CPU_SSE_XMMCACHE_END
    CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
    int t0reg = _allocTempXMMreg(XMMT_INT, -1);
    int t1reg = _allocTempXMMreg(XMMT_INT, -1);
    int t2reg = _allocTempXMMreg(XMMT_INT, -1);
    int t3reg = _allocTempXMMreg(XMMT_INT, -1);

    if ( cpucaps.hasStreamingSIMD4Extensions ) {
        SSE4_PMOVSXDQ_XMM_to_XMM(t0reg, EEREC_S);
        SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, EEREC_T);
        SSE2_PSUBQ_XMM_to_XMM(t0reg, t1reg);
        SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x0e);
        SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x0e);
        SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, t1reg);
        SSE4_PMOVSXDQ_XMM_to_XMM(t2reg, t2reg);
    }
    else {
        SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_S);
        SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_T);
        SSE2_PXOR_XMM_to_XMM(t2reg, t2reg);
        SSE2_PXOR_XMM_to_XMM(t3reg, t3reg);
        SSE2_PCMPGTD_XMM_to_XMM(t2reg, t0reg);
        SSE2_PCMPGTD_XMM_to_XMM(t3reg, t1reg);
        SSE2_PUNPCKLDQ_XMM_to_XMM(t0reg, t2reg);
        SSE2_PUNPCKLDQ_XMM_to_XMM(t1reg, t3reg);
        SSE2_PSUBQ_XMM_to_XMM(t0reg, t1reg);
        SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_S);
        SSE2_PUNPCKHDQ_XMM_to_XMM(t1reg, t2reg);
        SSEX_MOVDQA_XMM_to_XMM(t2reg, EEREC_T);
        SSE2_PUNPCKHDQ_XMM_to_XMM(t2reg, t3reg);
    }
    SSE2_PSUBQ_XMM_to_XMM(t1reg, t2reg);

    SSEX_MOVDQA_XMM_to_XMM(t2reg, t0reg);
    SSE_SHUFPS_XMM_to_XMM(t2reg, t1reg, 0xdd);
    SSE2_PSRAD_I8_to_XMM(t2reg, 31);

    SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0x50);
    SSE2_PXOR_XMM_to_XMM(t0reg, t3reg);
    SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
    SSE2_PADDQ_XMM_to_XMM(t0reg, t3reg);
    SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0xfa);
    SSE2_PXOR_XMM_to_XMM(t1reg, t3reg);
    SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
    SSE2_PADDQ_XMM_to_XMM(t1reg, t3reg);
    SSE2_PSLLQ_I8_to_XMM(t0reg, 1);
    SSE2_PSLLQ_I8_to_XMM(t1reg, 1);
    SSE2_PCMPEQB_XMM_to_XMM(t3reg, t3reg);
    SSE2_PSRLD_I8_to_XMM(t3reg, 1);
    SSE2_PXOR_XMM_to_XMM(t2reg, t3reg);
    SSE_SHUFPS_XMM_to_XMM(t0reg, t1reg, 0xdd);
    SSE2_PXOR_XMM_to_XMM(t1reg, t1reg);
    SSE2_PCMPEQD_XMM_to_XMM(t1reg, t0reg);
    if( EEREC_D == EEREC_S ) SSE2_PSUBD_XMM_to_XMM(EEREC_D, EEREC_T);
    else if( EEREC_D == EEREC_T ) {
        SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_T);
        SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
        SSE2_PSUBD_XMM_to_XMM(EEREC_D, t0reg);
    }
    else {
        SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
        SSE2_PSUBD_XMM_to_XMM(EEREC_D, EEREC_T);
    }
    SSE2_PAND_XMM_to_XMM(EEREC_D, t1reg);
    SSE2_PANDN_XMM_to_XMM(t1reg, t2reg);
    SSE2_POR_XMM_to_XMM(EEREC_D, t1reg);
    _freeXMMreg(t0reg);
    _freeXMMreg(t1reg);
    _freeXMMreg(t2reg);
    _freeXMMreg(t3reg);
    CPU_SSE_XMMCACHE_END

    if( _Rd_ ) _deleteEEreg(_Rd_, 0);
    _deleteEEreg(_Rs_, 1);
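Note (illustrative, not part of the diff): recPSUBSW mirrors recPADDSW but has one extra case in the final subtract, because subtraction is not commutative: when EEREC_D aliases EEREC_T, the code copies EEREC_T into a temporary before overwriting the destination. A small sketch of the hazard being avoided, with placeholder names:

    #include <stdint.h>

    /* If d and t point to the same storage, "copy s into d, then subtract t"
       would read the already-overwritten t. Saving t first (as recPSUBSW does
       with t0reg) keeps the result correct. */
    static void sub_when_dst_aliases_rhs(int32_t *d, const int32_t *s, const int32_t *t)
    {
        int32_t saved_t = *t;  /* SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_T)   */
        *d = *s;               /* SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S) */
        *d -= saved_t;         /* SSE2_PSUBD_XMM_to_XMM(EEREC_D, t0reg)    */
    }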
@@ -1616,6 +1616,7 @@ extern void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 i
extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);

//*********************
// SSE-X - uses both SSE and SSE2 code and tries to keep consistency between the data
@@ -1733,4 +1734,4 @@ __forceinline void write32(u32 val )
    x86Ptr += 4;
}

#endif // __IX86_H__
#endif // __IX86_H__
@@ -1135,6 +1135,14 @@ __forceinline void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from)
    write32(MEMADDR(from, 4 + overb));
}

__forceinline void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
    write8(0x66);
    RexRB(0, to, from);
    write24(0x25380F);
    ModRM(3, to, from);
}

// SSE-X
__forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{
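Note (illustrative, not part of the diff): the new SSE4_PMOVSXDQ_XMM_to_XMM emitter produces the register-to-register PMOVSXDQ encoding 66 [REX] 0F 38 25 /r; write24(0x25380F) emits the three opcode bytes 0F 38 25 in little-endian order. A small self-contained check of the expected byte stream for "pmovsxdq xmm1, xmm2" (no REX prefix is needed for registers 0-7); the program below is only an illustration, not part of the emitter:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 0x66, then 0F 38 25, then ModRM = 0xC0 | (to << 3) | from */
        const uint8_t expected[] = { 0x66, 0x0F, 0x38, 0x25, 0xC0 | (1 << 3) | 2 };
        for (size_t i = 0; i < sizeof(expected); i++)
            printf("%02X ", expected[i]);   /* prints: 66 0F 38 25 CA */
        printf("\n");
        return 0;
    }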