Added tmkk's PADDSW/PSUBSW opcodes which replace the slower interpreter ones we used so far. Great job on those tmkk!

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@613 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
ramapcsx2 2009-01-20 21:34:17 +00:00 committed by Gregory Hainaut
parent d0a7068f2e
commit cf156d3c27
3 changed files with 178 additions and 5 deletions

View File

@ -937,8 +937,104 @@ void recPADDSW( void )
{
if ( ! _Rd_ ) return;
//CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
//CPU_SSE_XMMCACHE_END
CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
int t3reg = _allocTempXMMreg(XMMT_INT, -1);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE4_PMOVSXDQ_XMM_to_XMM(t0reg, EEREC_S);
SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, EEREC_T);
SSE2_PADDQ_XMM_to_XMM(t0reg, t1reg);
SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x0e);
SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x0e);
SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, t1reg);
SSE4_PMOVSXDQ_XMM_to_XMM(t2reg, t2reg);
}
else {
SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_S);
SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_T);
SSE2_PXOR_XMM_to_XMM(t2reg, t2reg);
SSE2_PXOR_XMM_to_XMM(t3reg, t3reg);
SSE2_PCMPGTD_XMM_to_XMM(t2reg, t0reg);
SSE2_PCMPGTD_XMM_to_XMM(t3reg, t1reg);
SSE2_PUNPCKLDQ_XMM_to_XMM(t0reg, t2reg);
SSE2_PUNPCKLDQ_XMM_to_XMM(t1reg, t3reg);
SSE2_PADDQ_XMM_to_XMM(t0reg, t1reg);
SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_S);
SSE2_PUNPCKHDQ_XMM_to_XMM(t1reg, t2reg);
SSEX_MOVDQA_XMM_to_XMM(t2reg, EEREC_T);
SSE2_PUNPCKHDQ_XMM_to_XMM(t2reg, t3reg);
}
SSE2_PADDQ_XMM_to_XMM(t1reg, t2reg);
/*
t0reg = { Rs[0]+Rt[0], Rs[1]+Rt[1] }
t1reg = { Rs[2]+Rt[2], Rs[3]+Rt[3] }
*/
SSEX_MOVDQA_XMM_to_XMM(t2reg, t0reg);
SSE_SHUFPS_XMM_to_XMM(t2reg, t1reg, 0xdd);
SSE2_PSRAD_I8_to_XMM(t2reg, 31);
/*
t2reg = { (Rs[0]+Rt[0]) < 0 ? 0xFFFFFFFF : 0,
(Rs[1]+Rt[1]) < 0 ? 0xFFFFFFFF : 0,
(Rs[2]+Rt[2]) < 0 ? 0xFFFFFFFF : 0,
(Rs[3]+Rt[3]) < 0 ? 0xFFFFFFFF : 0 }
*/
SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0x50);
SSE2_PXOR_XMM_to_XMM(t0reg, t3reg);
SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
SSE2_PADDQ_XMM_to_XMM(t0reg, t3reg);
/*
t0reg = { abs(Rs[0]+Rt[0]), abs(Rs[1]+Rt[1]) }
*/
SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0xfa);
SSE2_PXOR_XMM_to_XMM(t1reg, t3reg);
SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
SSE2_PADDQ_XMM_to_XMM(t1reg, t3reg);
/*
t1reg = { abs(Rs[2]+Rt[2]), abs(Rs[3]+Rt[3]) }
*/
SSE2_PSLLQ_I8_to_XMM(t0reg, 1);
SSE2_PSLLQ_I8_to_XMM(t1reg, 1);
SSE2_PCMPEQB_XMM_to_XMM(t3reg, t3reg);
SSE2_PSRLD_I8_to_XMM(t3reg, 1);
SSE2_PXOR_XMM_to_XMM(t2reg, t3reg);
SSE_SHUFPS_XMM_to_XMM(t0reg, t1reg, 0xdd);
SSE2_PXOR_XMM_to_XMM(t1reg, t1reg);
SSE2_PCMPEQD_XMM_to_XMM(t1reg, t0reg);
/*
t1reg = { abs(Rs[0]+Rt[0]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
abs(Rs[1]+Rt[1]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
abs(Rs[2]+Rt[2]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF,
abs(Rs[3]+Rt[3]) > 0x7FFFFFFF ? 0 : 0xFFFFFFFF }
t2reg = { (Rs[0]+Rt[0]) < 0 ? 0x80000000 : 0x7FFFFFFF,
(Rs[1]+Rt[1]) < 0 ? 0x80000000 : 0x7FFFFFFF,
(Rs[2]+Rt[2]) < 0 ? 0x80000000 : 0x7FFFFFFF,
(Rs[3]+Rt[3]) < 0 ? 0x80000000 : 0x7FFFFFFF }
*/
if( EEREC_D == EEREC_S ) SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_T);
else if( EEREC_D == EEREC_T ) SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_S);
else {
SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
SSE2_PADDD_XMM_to_XMM(EEREC_D, EEREC_T);
}
SSE2_PAND_XMM_to_XMM(EEREC_D, t1reg);
SSE2_PANDN_XMM_to_XMM(t1reg, t2reg);
SSE2_POR_XMM_to_XMM(EEREC_D, t1reg);
/*
Rd = { t1reg[0] ? Rs[0]+Rt[0] : t2reg[0],
t1reg[1] ? Rs[1]+Rt[1] : t2reg[1],
t1reg[2] ? Rs[2]+Rt[2] : t2reg[2],
t1reg[3] ? Rs[3]+Rt[3] : t2reg[3] }
*/
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
_freeXMMreg(t3reg);
CPU_SSE_XMMCACHE_END
if( _Rd_ ) _deleteEEreg(_Rd_, 0);
_deleteEEreg(_Rs_, 1);
@ -1028,8 +1124,76 @@ void recPSUBSW( void )
{
if ( ! _Rd_ ) return;
//CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
//CPU_SSE_XMMCACHE_END
CPU_SSE2_XMMCACHE_START(XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED)
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
int t2reg = _allocTempXMMreg(XMMT_INT, -1);
int t3reg = _allocTempXMMreg(XMMT_INT, -1);
if ( cpucaps.hasStreamingSIMD4Extensions ) {
SSE4_PMOVSXDQ_XMM_to_XMM(t0reg, EEREC_S);
SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, EEREC_T);
SSE2_PSUBQ_XMM_to_XMM(t0reg, t1reg);
SSE2_PSHUFD_XMM_to_XMM(t1reg, EEREC_S, 0x0e);
SSE2_PSHUFD_XMM_to_XMM(t2reg, EEREC_T, 0x0e);
SSE4_PMOVSXDQ_XMM_to_XMM(t1reg, t1reg);
SSE4_PMOVSXDQ_XMM_to_XMM(t2reg, t2reg);
}
else {
SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_S);
SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_T);
SSE2_PXOR_XMM_to_XMM(t2reg, t2reg);
SSE2_PXOR_XMM_to_XMM(t3reg, t3reg);
SSE2_PCMPGTD_XMM_to_XMM(t2reg, t0reg);
SSE2_PCMPGTD_XMM_to_XMM(t3reg, t1reg);
SSE2_PUNPCKLDQ_XMM_to_XMM(t0reg, t2reg);
SSE2_PUNPCKLDQ_XMM_to_XMM(t1reg, t3reg);
SSE2_PSUBQ_XMM_to_XMM(t0reg, t1reg);
SSEX_MOVDQA_XMM_to_XMM(t1reg, EEREC_S);
SSE2_PUNPCKHDQ_XMM_to_XMM(t1reg, t2reg);
SSEX_MOVDQA_XMM_to_XMM(t2reg, EEREC_T);
SSE2_PUNPCKHDQ_XMM_to_XMM(t2reg, t3reg);
}
SSE2_PSUBQ_XMM_to_XMM(t1reg, t2reg);
SSEX_MOVDQA_XMM_to_XMM(t2reg, t0reg);
SSE_SHUFPS_XMM_to_XMM(t2reg, t1reg, 0xdd);
SSE2_PSRAD_I8_to_XMM(t2reg, 31);
SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0x50);
SSE2_PXOR_XMM_to_XMM(t0reg, t3reg);
SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
SSE2_PADDQ_XMM_to_XMM(t0reg, t3reg);
SSE2_PSHUFD_XMM_to_XMM(t3reg, t2reg, 0xfa);
SSE2_PXOR_XMM_to_XMM(t1reg, t3reg);
SSE2_PSRLQ_I8_to_XMM(t3reg, 63);
SSE2_PADDQ_XMM_to_XMM(t1reg, t3reg);
SSE2_PSLLQ_I8_to_XMM(t0reg, 1);
SSE2_PSLLQ_I8_to_XMM(t1reg, 1);
SSE2_PCMPEQB_XMM_to_XMM(t3reg, t3reg);
SSE2_PSRLD_I8_to_XMM(t3reg, 1);
SSE2_PXOR_XMM_to_XMM(t2reg, t3reg);
SSE_SHUFPS_XMM_to_XMM(t0reg, t1reg, 0xdd);
SSE2_PXOR_XMM_to_XMM(t1reg, t1reg);
SSE2_PCMPEQD_XMM_to_XMM(t1reg, t0reg);
if( EEREC_D == EEREC_S ) SSE2_PSUBD_XMM_to_XMM(EEREC_D, EEREC_T);
else if( EEREC_D == EEREC_T ) {
SSEX_MOVDQA_XMM_to_XMM(t0reg, EEREC_T);
SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
SSE2_PSUBD_XMM_to_XMM(EEREC_D, t0reg);
}
else {
SSEX_MOVDQA_XMM_to_XMM(EEREC_D, EEREC_S);
SSE2_PSUBD_XMM_to_XMM(EEREC_D, EEREC_T);
}
SSE2_PAND_XMM_to_XMM(EEREC_D, t1reg);
SSE2_PANDN_XMM_to_XMM(t1reg, t2reg);
SSE2_POR_XMM_to_XMM(EEREC_D, t1reg);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
_freeXMMreg(t2reg);
_freeXMMreg(t3reg);
CPU_SSE_XMMCACHE_END
if( _Rd_ ) _deleteEEreg(_Rd_, 0);
_deleteEEreg(_Rs_, 1);

View File

@ -1616,6 +1616,7 @@ extern void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 i
extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8);
extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from);
extern void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from);
//*********************
// SSE-X - uses both SSE,SSE2 code and tries to keep consistensies between the data
@ -1733,4 +1734,4 @@ __forceinline void write32(u32 val )
x86Ptr += 4;
}
#endif // __IX86_H__
#endif // __IX86_H__

View File

@ -1135,6 +1135,14 @@ __forceinline void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from)
write32(MEMADDR(from, 4 + overb));
}
__forceinline void SSE4_PMOVSXDQ_XMM_to_XMM(x86SSERegType to, x86SSERegType from)
{
write8(0x66);
RexRB(0, to, from);
write24(0x25380F);
ModRM(3, to, from);
}
// SSE-X
__forceinline void SSEX_MOVDQA_M128_to_XMM( x86SSERegType to, uptr from )
{