EE: port MMX code to SSE for shift opcode

Code need to be enabled with a define (NO_MMX 1)

Code was tested with ps2autotest but it need more tests. I need to check
alignement issue too.

Globally code is potentially a little slower than SSE.
The trick is that we need to shift only the 64 lsb whereas SSE will
shift the full 128 bits register.

Current implementation flush the lsb and drop the full register. It is
unlikely that next intruction will be done in SSE anyway.

Note: it would be easier in x64 arch
This commit is contained in:
Gregory Hainaut 2016-02-06 15:24:26 +01:00
parent 119f6deb24
commit a9a955f8b9
2 changed files with 239 additions and 11 deletions

View File

@ -748,6 +748,7 @@ void _deleteACCtoXMMreg(int vu, int flush)
// Flush is 0: _freeXMMreg. Flush in memory if MODE_WRITE. Clear inuse
// Flush is 1: Flush in memory. But register is still valid
// Flush is 2: like 0 ...
// Flush is 3: drop register content
void _deleteGPRtoXMMreg(int reg, int flush)
{
int i;
@ -775,6 +776,10 @@ void _deleteGPRtoXMMreg(int reg, int flush)
if( flush == 2 )
xmmregs[i].inuse = 0;
break;
case 3:
xmmregs[i].inuse = 0;
break;
}
return;

View File

@ -20,6 +20,8 @@
#include "R5900OpcodeTables.h"
#include "iR5900.h"
#define NO_MMX 0
using namespace x86Emitter;
namespace R5900 {
@ -143,6 +145,22 @@ void recDSLLs_(int info, int sa)
int rtreg, rdreg;
pxAssert( !(info & PROCESS_EE_XMM) );
#if NO_MMX
_addNeededGPRtoXMMreg(_Rt_);
_addNeededGPRtoXMMreg(_Rd_);
rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
xPSLL.Q(xRegisterSSE(rdreg), sa);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
_addNeededMMXreg(MMX_GPR+_Rt_);
_addNeededMMXreg(MMX_GPR+_Rd_);
rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@ -151,6 +169,7 @@ void recDSLLs_(int info, int sa)
if( rtreg != rdreg ) xMOVQ(xRegisterMMX(rdreg), xRegisterMMX(rtreg));
xPSLL.Q(xRegisterMMX(rdreg), sa);
#endif
}
void recDSLL_(int info)
@ -171,6 +190,22 @@ void recDSRLs_(int info, int sa)
int rtreg, rdreg;
pxAssert( !(info & PROCESS_EE_XMM) );
#if NO_MMX
_addNeededGPRtoXMMreg(_Rt_);
_addNeededGPRtoXMMreg(_Rd_);
rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
xPSRL.Q(xRegisterSSE(rdreg), sa);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
_addNeededMMXreg(MMX_GPR+_Rt_);
_addNeededMMXreg(MMX_GPR+_Rd_);
rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@ -179,6 +214,7 @@ void recDSRLs_(int info, int sa)
if( rtreg != rdreg ) xMOVQ(xRegisterMMX(rdreg), xRegisterMMX(rtreg));
xPSRL.Q(xRegisterMMX(rdreg), sa);
#endif
}
void recDSRL_(int info)
@ -199,6 +235,42 @@ void recDSRAs_(int info, int sa)
int rtreg, rdreg, t0reg;
pxAssert( !(info & PROCESS_EE_XMM) );
#if NO_MMX
_addNeededGPRtoXMMreg(_Rt_);
_addNeededGPRtoXMMreg(_Rd_);
rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
if ( sa ) {
t0reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
// it is a signed shift (but 64 bits operands aren't supported on 32 bits even on SSE)
xPSRA.D(xRegisterSSE(t0reg), sa);
xPSRL.Q(xRegisterSSE(rdreg), sa);
// It can be done in one blend instruction in SSE4.1
// Goal is to move 63:32 of t0reg to 63:32 rdreg
{
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
// take lower dword of rdreg and lower dword of t0reg
xPUNPCK.LDQ(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
}
_freeXMMreg(t0reg);
}
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
_addNeededMMXreg(MMX_GPR+_Rt_);
_addNeededMMXreg(MMX_GPR+_Rd_);
rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@ -221,6 +293,7 @@ void recDSRAs_(int info, int sa)
_freeMMXreg(t0reg);
}
#endif
}
void recDSRA_(int info)
@ -314,10 +387,25 @@ EERECOMPILE_CODEX(eeRecompileCode2, DSRA32);
__aligned16 u32 s_sa[4] = {0x1f, 0, 0x3f, 0};
int recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp, int shift64)
void recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp)
{
pxAssert( !(info & PROCESS_EE_XMM) );
#if NO_MMX
_addNeededGPRtoXMMreg(_Rt_);
_addNeededGPRtoXMMreg(_Rd_);
*rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
*rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
*rstemp = _allocTempXMMreg(XMMT_INT, -1);
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xAND(eax, 0x3f);
xMOVDZX(xRegisterSSE(*rstemp), eax);
*rsreg = *rstemp;
if( *rtreg != *rdreg ) xMOVDQA(xRegisterSSE(*rdreg), xRegisterSSE(*rtreg));
#else
_addNeededMMXreg(MMX_GPR+_Rt_);
_addNeededMMXreg(MMX_GPR+_Rd_);
*rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@ -326,27 +414,41 @@ int recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp, int
*rstemp = _allocMMXreg(-1, MMX_TEMP, 0);
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xAND(eax, shift64?0x3f:0x1f);
xAND(eax, 0x3f);
xMOVDZX(xRegisterMMX(*rstemp), eax);
*rsreg = *rstemp;
if( *rtreg != *rdreg ) xMOVQ(xRegisterMMX(*rdreg), xRegisterMMX(*rtreg));
return 1;
#endif
}
void recSetConstShiftV(int info, int* rsreg, int* rdreg, int* rstemp, int shift64)
void recSetConstShiftV(int info, int* rsreg, int* rdreg, int* rstemp)
{
#if NO_MMX
_addNeededGPRtoXMMreg(_Rd_);
*rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
*rstemp = _allocTempXMMreg(XMMT_INT, -1);
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xAND(eax, 0x3f);
xMOVDZX(xRegisterSSE(*rstemp), eax);
*rsreg = *rstemp;
_flushConstReg(_Rt_);
#else
_addNeededMMXreg(MMX_GPR+_Rd_);
*rdreg = _allocMMXreg(-1, MMX_GPR+_Rd_, MODE_WRITE);
SetMMXstate();
*rstemp = _allocMMXreg(-1, MMX_TEMP, 0);
xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
xAND(eax, shift64?0x3f:0x1f);
xAND(eax, 0x3f);
xMOVDZX(xRegisterMMX(*rstemp), eax);
*rsreg = *rstemp;
_flushConstReg(_Rt_);
#endif
}
//// SLLV
@ -479,20 +581,45 @@ void recDSLLV_consts(int info)
void recDSLLV_constt(int info)
{
int rsreg, rdreg, rstemp = -1;
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
#if NO_MMX
xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPSLL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
if( rstemp != -1 ) _freeXMMreg(rstemp);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPSLL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
void recDSLLV_(int info)
{
int rsreg, rtreg, rdreg, rstemp = -1;
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
#if NO_MMX
xPSLL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
if( rstemp != -1 ) _freeXMMreg(rstemp);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
xPSLL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
EERECOMPILE_CODE0(DSLLV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);
@ -513,20 +640,45 @@ void recDSRLV_consts(int info)
void recDSRLV_constt(int info)
{
int rsreg, rdreg, rstemp = -1;
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
#if NO_MMX
xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
if( rstemp != -1 ) _freeXMMreg(rstemp);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPSRL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
void recDSRLV_(int info)
{
int rsreg, rtreg, rdreg, rstemp = -1;
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
#if NO_MMX
xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
if( rstemp != -1 ) _freeXMMreg(rstemp);
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
#else
xPSRL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
EERECOMPILE_CODE0(DSRLV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);
@ -547,10 +699,45 @@ void recDSRAV_consts(int info)
void recDSRAV_constt(int info)
{
int rsreg, rdreg, rstemp = -1, t0reg, t1reg;
#if NO_MMX
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t1reg = _allocTempXMMreg(XMMT_INT, -1);
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
// calc high bit
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(rdreg));
xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(rdreg));
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
// shift highest bit, 64 - eax
xMOV(eax, 64);
xMOVDZX(xRegisterSSE(t1reg), eax);
xPSUB.D(xRegisterSSE(t1reg), xRegisterSSE(rsreg));
// right logical shift
xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); // highest bits
xPOR(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rd_, 3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
if( rstemp != -1 ) _freeXMMreg(rstemp);
#else
t0reg = _allocMMXreg(-1, MMX_TEMP, 0);
t1reg = _allocMMXreg(-1, MMX_TEMP, 0);
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
xPXOR(xRegisterMMX(t0reg), xRegisterMMX(t0reg));
@ -574,14 +761,49 @@ void recDSRAV_constt(int info)
_freeMMXreg(t0reg);
_freeMMXreg(t1reg);
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
void recDSRAV_(int info)
{
int rsreg, rtreg, rdreg, rstemp = -1, t0reg, t1reg;
#if NO_MMX
t0reg = _allocTempXMMreg(XMMT_INT, -1);
t1reg = _allocTempXMMreg(XMMT_INT, -1);
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
// calc high bit
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(rdreg));
xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(rdreg));
xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
// shift highest bit, 64 - eax
xMOV(eax, 64);
xMOVDZX(xRegisterSSE(t1reg), eax);
xPSUB.D(xRegisterSSE(t1reg), xRegisterSSE(rsreg));
// right logical shift
xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); // highest bits
xPOR(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
// flush lower 64 bits (as upper is wrong)
// The others possibility could be a read back of the upper 64 bits
// (better use of register but code will likely be flushed after anyway)
xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
_deleteGPRtoXMMreg(_Rt_, 3);
_deleteGPRtoXMMreg(_Rd_, 3);
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
if( rstemp != -1 ) _freeXMMreg(rstemp);
#else
t0reg = _allocMMXreg(-1, MMX_TEMP, 0);
t1reg = _allocMMXreg(-1, MMX_TEMP, 0);
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
xPXOR(xRegisterMMX(t0reg), xRegisterMMX(t0reg));
@ -604,6 +826,7 @@ void recDSRAV_(int info)
_freeMMXreg(t0reg);
_freeMMXreg(t1reg);
if( rstemp != -1 ) _freeMMXreg(rstemp);
#endif
}
EERECOMPILE_CODE0(DSRAV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);