diff --git a/pcsx2/x86/iCore.cpp b/pcsx2/x86/iCore.cpp
index a3125f0752..557365642f 100644
--- a/pcsx2/x86/iCore.cpp
+++ b/pcsx2/x86/iCore.cpp
@@ -748,6 +748,7 @@ void _deleteACCtoXMMreg(int vu, int flush)
 // Flush is 0: _freeXMMreg. Flush in memory if MODE_WRITE. Clear inuse
 // Flush is 1: Flush in memory. But register is still valid
 // Flush is 2: like 0 ...
+// Flush is 3: drop register content
 void _deleteGPRtoXMMreg(int reg, int flush)
 {
     int i;
@@ -775,6 +776,10 @@ void _deleteGPRtoXMMreg(int reg, int flush)
             if( flush == 2 )
                 xmmregs[i].inuse = 0;
             break;
+
+        case 3:
+            xmmregs[i].inuse = 0;
+            break;
         }
 
         return;
diff --git a/pcsx2/x86/ix86-32/iR5900Shift.cpp b/pcsx2/x86/ix86-32/iR5900Shift.cpp
index 21ed7c778f..757d748cd1 100644
--- a/pcsx2/x86/ix86-32/iR5900Shift.cpp
+++ b/pcsx2/x86/ix86-32/iR5900Shift.cpp
@@ -20,6 +20,8 @@
 #include "R5900OpcodeTables.h"
 #include "iR5900.h"
 
+#define NO_MMX 0
+
 using namespace x86Emitter;
 
 namespace R5900 {
@@ -143,6 +145,22 @@ void recDSLLs_(int info, int sa)
     int rtreg, rdreg;
     pxAssert( !(info & PROCESS_EE_XMM) );
 
+#if NO_MMX
+    _addNeededGPRtoXMMreg(_Rt_);
+    _addNeededGPRtoXMMreg(_Rd_);
+    rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+    rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
+
+    if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
+    xPSLL.Q(xRegisterSSE(rdreg), sa);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     _addNeededMMXreg(MMX_GPR+_Rt_);
     _addNeededMMXreg(MMX_GPR+_Rd_);
     rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@@ -151,6 +169,7 @@ void recDSLLs_(int info, int sa)
 
     if( rtreg != rdreg ) xMOVQ(xRegisterMMX(rdreg), xRegisterMMX(rtreg));
     xPSLL.Q(xRegisterMMX(rdreg), sa);
+#endif
 }
 
 void recDSLL_(int info)
@@ -171,6 +190,22 @@ void recDSRLs_(int info, int sa)
     int rtreg, rdreg;
     pxAssert( !(info & PROCESS_EE_XMM) );
 
+#if NO_MMX
+    _addNeededGPRtoXMMreg(_Rt_);
+    _addNeededGPRtoXMMreg(_Rd_);
+    rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+    rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
+
+    if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
+    xPSRL.Q(xRegisterSSE(rdreg), sa);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     _addNeededMMXreg(MMX_GPR+_Rt_);
     _addNeededMMXreg(MMX_GPR+_Rd_);
     rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@@ -179,6 +214,7 @@ void recDSRLs_(int info, int sa)
 
     if( rtreg != rdreg ) xMOVQ(xRegisterMMX(rdreg), xRegisterMMX(rtreg));
     xPSRL.Q(xRegisterMMX(rdreg), sa);
+#endif
 }
 
 void recDSRL_(int info)
@@ -199,6 +235,42 @@ void recDSRAs_(int info, int sa)
     int rtreg, rdreg, t0reg;
     pxAssert( !(info & PROCESS_EE_XMM) );
 
+#if NO_MMX
+    _addNeededGPRtoXMMreg(_Rt_);
+    _addNeededGPRtoXMMreg(_Rd_);
+    rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+    rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
+
+    if( rtreg != rdreg ) xMOVDQA(xRegisterSSE(rdreg), xRegisterSSE(rtreg));
+
+    if ( sa ) {
+
+        t0reg = _allocTempXMMreg(XMMT_INT, -1);
+
+        xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(rtreg));
+
+        // it is a signed shift, but SSE has no 64-bit arithmetic right shift
+        xPSRA.D(xRegisterSSE(t0reg), sa);
+        xPSRL.Q(xRegisterSSE(rdreg), sa);
+
+        // With SSE4.1 this could be done with a single blend instruction
+        // The goal is to move bits 63:32 of t0reg into bits 63:32 of rdreg
+        {
+            xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
+            // interleave the lower dword of rdreg with the lower dword of t0reg
+            xPUNPCK.LDQ(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
+        }
+
+        _freeXMMreg(t0reg);
+    }
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     _addNeededMMXreg(MMX_GPR+_Rt_);
     _addNeededMMXreg(MMX_GPR+_Rd_);
     rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@@ -221,6 +293,7 @@ void recDSRAs_(int info, int sa)
 
         _freeMMXreg(t0reg);
     }
+#endif
 }
 
 void recDSRA_(int info)
@@ -314,10 +387,25 @@ EERECOMPILE_CODEX(eeRecompileCode2, DSRA32);
 
 __aligned16 u32 s_sa[4] = {0x1f, 0, 0x3f, 0};
 
-int recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp, int shift64)
+void recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp)
 {
     pxAssert( !(info & PROCESS_EE_XMM) );
 
+#if NO_MMX
+    _addNeededGPRtoXMMreg(_Rt_);
+    _addNeededGPRtoXMMreg(_Rd_);
+    *rtreg = _allocGPRtoXMMreg(-1, _Rt_, MODE_READ);
+    *rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
+
+    *rstemp = _allocTempXMMreg(XMMT_INT, -1);
+
+    xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
+    xAND(eax, 0x3f);
+    xMOVDZX(xRegisterSSE(*rstemp), eax);
+    *rsreg = *rstemp;
+
+    if( *rtreg != *rdreg ) xMOVDQA(xRegisterSSE(*rdreg), xRegisterSSE(*rtreg));
+#else
     _addNeededMMXreg(MMX_GPR+_Rt_);
     _addNeededMMXreg(MMX_GPR+_Rd_);
     *rtreg = _allocMMXreg(-1, MMX_GPR+_Rt_, MODE_READ);
@@ -326,27 +414,41 @@ int recSetShiftV(int info, int* rsreg, int* rtreg, int* rdreg, int* rstemp, int
     *rstemp = _allocMMXreg(-1, MMX_TEMP, 0);
 
     xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
-    xAND(eax, shift64?0x3f:0x1f);
+    xAND(eax, 0x3f);
     xMOVDZX(xRegisterMMX(*rstemp), eax);
     *rsreg = *rstemp;
 
     if( *rtreg != *rdreg ) xMOVQ(xRegisterMMX(*rdreg), xRegisterMMX(*rtreg));
-    return 1;
+#endif
 }
 
-void recSetConstShiftV(int info, int* rsreg, int* rdreg, int* rstemp, int shift64)
+void recSetConstShiftV(int info, int* rsreg, int* rdreg, int* rstemp)
 {
+#if NO_MMX
+    _addNeededGPRtoXMMreg(_Rd_);
+    *rdreg = _allocGPRtoXMMreg(-1, _Rd_, MODE_WRITE);
+
+    *rstemp = _allocTempXMMreg(XMMT_INT, -1);
+
+    xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
+    xAND(eax, 0x3f);
+    xMOVDZX(xRegisterSSE(*rstemp), eax);
+    *rsreg = *rstemp;
+
+    _flushConstReg(_Rt_);
+#else
     _addNeededMMXreg(MMX_GPR+_Rd_);
     *rdreg = _allocMMXreg(-1, MMX_GPR+_Rd_, MODE_WRITE);
     SetMMXstate();
     *rstemp = _allocMMXreg(-1, MMX_TEMP, 0);
 
     xMOV(eax, ptr[&cpuRegs.GPR.r[_Rs_].UL[0]]);
-    xAND(eax, shift64?0x3f:0x1f);
+    xAND(eax, 0x3f);
     xMOVDZX(xRegisterMMX(*rstemp), eax);
     *rsreg = *rstemp;
 
     _flushConstReg(_Rt_);
+#endif
 }
 
 //// SLLV
@@ -479,20 +581,45 @@ void recDSLLV_consts(int info)
 void recDSLLV_constt(int info)
 {
     int rsreg, rdreg, rstemp = -1;
-    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
+    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
 
+#if NO_MMX
+    xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
+    xPSLL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
     xPSLL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 void recDSLLV_(int info)
 {
     int rsreg, rtreg, rdreg, rstemp = -1;
-    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
+    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
 
+#if NO_MMX
+    xPSLL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     xPSLL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 EERECOMPILE_CODE0(DSLLV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);
@@ -513,20 +640,45 @@ void recDSRLV_consts(int info)
 void recDSRLV_constt(int info)
 {
     int rsreg, rdreg, rstemp = -1;
-    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
+    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
 
+#if NO_MMX
+    xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
+    xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
     xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
     xPSRL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 void recDSRLV_(int info)
 {
     int rsreg, rtreg, rdreg, rstemp = -1;
-    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
+    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
 
+#if NO_MMX
+    xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+#else
    xPSRL.Q(xRegisterMMX(rdreg), xRegisterMMX(rsreg));
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 EERECOMPILE_CODE0(DSRLV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);
@@ -547,10 +699,45 @@ void recDSRAV_consts(int info)
 void recDSRAV_constt(int info)
 {
     int rsreg, rdreg, rstemp = -1, t0reg, t1reg;
+#if NO_MMX
+    t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    t1reg = _allocTempXMMreg(XMMT_INT, -1);
+
+    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
+
+    xMOVDQA(xRegisterSSE(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
+    xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
+
+    // compute a sign mask from the high bit
+    xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(rdreg));
+    xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(rdreg));
+    xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
+
+    // the sign mask will be shifted left by (64 - shift amount)
+    xMOV(eax, 64);
+    xMOVDZX(xRegisterSSE(t1reg), eax);
+    xPSUB.D(xRegisterSSE(t1reg), xRegisterSSE(rsreg));
+
+    // logical right shift
+    xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); // sign bits
+
+    xPOR(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rd_, 3);
+
+    _freeXMMreg(t0reg);
+    _freeXMMreg(t1reg);
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+#else
     t0reg = _allocMMXreg(-1, MMX_TEMP, 0);
     t1reg = _allocMMXreg(-1, MMX_TEMP, 0);
 
-    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp, 1);
+    recSetConstShiftV(info, &rsreg, &rdreg, &rstemp);
 
     xMOVQ(xRegisterMMX(rdreg), ptr[&cpuRegs.GPR.r[_Rt_]]);
     xPXOR(xRegisterMMX(t0reg), xRegisterMMX(t0reg));
@@ -574,14 +761,49 @@ void recDSRAV_constt(int info)
     _freeMMXreg(t0reg);
     _freeMMXreg(t1reg);
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 void recDSRAV_(int info)
 {
     int rsreg, rtreg, rdreg, rstemp = -1, t0reg, t1reg;
+#if NO_MMX
+    t0reg = _allocTempXMMreg(XMMT_INT, -1);
+    t1reg = _allocTempXMMreg(XMMT_INT, -1);
+    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
+
+    xPXOR(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
+
+    // compute a sign mask from the high bit
+    xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(rdreg));
+    xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(rdreg));
+    xPSHUF.D(xRegisterSSE(t0reg), xRegisterSSE(t0reg), 0x55);
+
+    // the sign mask will be shifted left by (64 - shift amount)
+    xMOV(eax, 64);
+    xMOVDZX(xRegisterSSE(t1reg), eax);
+    xPSUB.D(xRegisterSSE(t1reg), xRegisterSSE(rsreg));
+
+    // logical right shift
+    xPSRL.Q(xRegisterSSE(rdreg), xRegisterSSE(rsreg));
+    xPSLL.Q(xRegisterSSE(t0reg), xRegisterSSE(t1reg)); // sign bits
+
+    xPOR(xRegisterSSE(rdreg), xRegisterSSE(t0reg));
+
+    // flush the lower 64 bits (the upper 64 bits are wrong)
+    // The other possibility would be to read back the upper 64 bits
+    // (better register usage, but the code will likely be flushed afterwards anyway)
+    xMOVL.PD(ptr64[&cpuRegs.GPR.r[ _Rd_ ].UD[ 0 ]] , xRegisterSSE(rdreg));
+    _deleteGPRtoXMMreg(_Rt_, 3);
+    _deleteGPRtoXMMreg(_Rd_, 3);
+
+    _freeXMMreg(t0reg);
+    _freeXMMreg(t1reg);
+    if( rstemp != -1 ) _freeXMMreg(rstemp);
+#else
     t0reg = _allocMMXreg(-1, MMX_TEMP, 0);
     t1reg = _allocMMXreg(-1, MMX_TEMP, 0);
 
-    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp, 1);
+    recSetShiftV(info, &rsreg, &rtreg, &rdreg, &rstemp);
 
     xPXOR(xRegisterMMX(t0reg), xRegisterMMX(t0reg));
@@ -604,6 +826,7 @@ void recDSRAV_(int info)
     _freeMMXreg(t0reg);
     _freeMMXreg(t1reg);
     if( rstemp != -1 ) _freeMMXreg(rstemp);
+#endif
 }
 
 EERECOMPILE_CODE0(DSRAV, XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED);
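
For reference, the constant-shift DSRA path above splits the 64-bit arithmetic shift into two 32-bit pieces because SSE2 has no PSRAQ. The following scalar sketch (not part of the patch; the helper name is made up for illustration) shows what the xPSRA.D / xPSRL.Q / xPSHUF.D / xPUNPCK.LDQ sequence in recDSRAs_ computes for shift amounts 1..31, assuming the usual x86 compiler behaviour that >> on a negative int32_t is an arithmetic shift:

#include <cstdint>

// Illustrative only: scalar equivalent of the constant-shift DSRA sequence, sa in [1, 31].
static uint64_t dsra_scalar_sketch(uint64_t rt, unsigned sa)
{
    // xPSRL.Q: a logical 64-bit shift already gives the correct low dword,
    // since for sa <= 31 the bits that need sign filling all sit in the high dword.
    uint32_t lo = (uint32_t)(rt >> sa);
    // xPSRA.D on the high dword: a 32-bit arithmetic shift gives the correct high dword.
    uint32_t hi = (uint32_t)((int32_t)(rt >> 32) >> sa);
    // xPSHUF.D + xPUNPCK.LDQ: recombine the two corrected dwords.
    return ((uint64_t)hi << 32) | lo;
}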
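
Similarly, the variable-shift DSRAV paths build the missing 64-bit arithmetic right shift out of a logical shift plus a sign mask. A scalar sketch of the same computation (again illustrative only, not part of the patch):

#include <cstdint>

// Illustrative only: scalar equivalent of the DSRAV sequence in the NO_MMX paths.
static uint64_t dsrav_scalar_sketch(uint64_t rt, uint32_t rs)
{
    unsigned sa   = rs & 0x3f;                           // xAND(eax, 0x3f)
    uint64_t sign = ((int64_t)rt < 0) ? ~0ull : 0ull;    // xPCMP.GTD + xPSHUF.D sign mask
    uint64_t lo   = rt >> sa;                            // xPSRL.Q by the shift amount
    // xPSLL.Q by (64 - sa): SSE shift counts of 64 or more yield 0, so sa == 0 needs no
    // special case there; plain C needs the guard because shifting by 64 is undefined.
    uint64_t hi   = (sa == 0) ? 0 : sign << (64 - sa);
    return lo | hi;                                      // xPOR
}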