core: purge sse2

Gauvain 'GovanifY' Roussel-Tarbouriech 2021-03-27 09:18:02 +01:00 committed by lightningterror
parent 5509bfc003
commit 8a9ec4c706
10 changed files with 71 additions and 461 deletions

View File

@ -46,13 +46,13 @@ void Pcsx2App::DetectCpuAndUserMode()
x86caps.CountCores();
x86caps.SIMD_EstablishMXCSRmask();
if (!x86caps.hasStreamingSIMD2Extensions)
if (!x86caps.hasStreamingSIMD4Extensions)
{
// This code will probably never run if the binary was correctly compiled for SSE2
// SSE2 is required for any decent speed and is supported by more than decade old x86 CPUs
// This code will probably never run if the binary was correctly compiled for SSE4
// SSE4 is required for any decent speed and is supported by x86 CPUs that are more than a decade old
throw Exception::HardwareDeficiency()
.SetDiagMsg(L"Critical Failure: SSE2 Extensions not available.")
.SetUserMsg(_("SSE2 extensions are not available. PCSX2 requires a cpu that supports the SSE2 instruction set."));
.SetDiagMsg(L"Critical Failure: SSE4 Extensions not available.")
.SetUserMsg(_("SSE4 extensions are not available. PCSX2 requires a cpu that supports the SSE4 instruction set."));
}
#endif
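
For context, PCSX2 probes these capability bits itself via CPUID at startup; below is a minimal standalone sketch of the same gate, using the GCC/Clang feature builtins instead of the real x86caps code and treating hasStreamingSIMD4Extensions as SSE4.1, which is what the instructions kept by this commit require. It is an illustration, not PCSX2's actual detection path.

// Hypothetical standalone check, not PCSX2's x86caps implementation.
#include <cstdio>
#include <cstdlib>

static void require_sse4()
{
    __builtin_cpu_init();                  // populate the compiler's feature cache
    if (!__builtin_cpu_supports("sse4.1")) // stand-in for hasStreamingSIMD4Extensions
    {
        std::fprintf(stderr, "SSE4 extensions are not available.\n");
        std::exit(EXIT_FAILURE);
    }
}

int main()
{
    require_sse4();
    std::puts("SSE4.1 present, recompilers can be reserved.");
    return 0;
}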

View File

@ -684,7 +684,6 @@ void Panels::PluginSelectorPanel::OnEnumComplete( wxCommandEvent& evt )
int index_avx2 = -1;
int index_sse4 = -1;
int index_sse2 = -1;
for( int i = 0; i < count; i++ )
{
@ -692,12 +691,10 @@ void Panels::PluginSelectorPanel::OnEnumComplete( wxCommandEvent& evt )
if( x86caps.hasAVX2 && str.Contains("AVX2") ) index_avx2 = i;
if( x86caps.hasStreamingSIMD4Extensions && str.Contains("SSE4") ) index_sse4 = i;
if( str.Contains("SSE2") ) index_sse2 = i;
}
if( index_avx2 >= 0 ) m_ComponentBoxes->Get(pid).SetSelection( index_avx2 );
else if( index_sse4 >= 0 ) m_ComponentBoxes->Get(pid).SetSelection( index_sse4 );
else if( index_sse2 >= 0 ) m_ComponentBoxes->Get(pid).SetSelection( index_sse2 );
else m_ComponentBoxes->Get(pid).SetSelection( 0 );
}
else

View File

@ -215,17 +215,9 @@ void recPMTHL()
int info = eeRecompileCodeXMM( XMMINFO_READS|XMMINFO_READLO|XMMINFO_READHI|XMMINFO_WRITELO|XMMINFO_WRITEHI );
if ( x86caps.hasStreamingSIMD4Extensions ) {
xBLEND.PS(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_S), 0x5);
xSHUF.PS(xRegisterSSE(EEREC_HI), xRegisterSSE(EEREC_S), 0xdd);
xSHUF.PS(xRegisterSSE(EEREC_HI), xRegisterSSE(EEREC_HI), 0x72);
}
else {
xSHUF.PS(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_S), 0x8d);
xSHUF.PS(xRegisterSSE(EEREC_HI), xRegisterSSE(EEREC_S), 0xdd);
xSHUF.PS(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_LO), 0x72);
xSHUF.PS(xRegisterSSE(EEREC_HI), xRegisterSSE(EEREC_HI), 0x72);
}
_clearNeededXMMregs();
}
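
The SSE4.1-only path kept above folds the old two-shuffle LO update into a single BLENDPS. A small illustration of what xBLEND.PS(lo, s, 0x5) computes, with intrinsics standing in for the emitted instruction:

#include <smmintrin.h> // SSE4.1
#include <cstdio>

int main()
{
    __m128 lo = _mm_setr_ps(10.f, 11.f, 12.f, 13.f); // current LO lanes 0..3
    __m128 s  = _mm_setr_ps(20.f, 21.f, 22.f, 23.f); // EEREC_S lanes 0..3

    // Immediate 0x5 = 0b0101: lanes 0 and 2 are taken from s, lanes 1 and 3
    // stay from lo, i.e. LO = { S[0], LO[1], S[2], LO[3] } in one instruction.
    __m128 r = _mm_blend_ps(lo, s, 0x5);

    float out[4];
    _mm_storeu_ps(out, r);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 20 11 22 13
    return 0;
}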
@ -400,7 +392,6 @@ void recPMAXW()
EE::Profiler.EmitOp(eeOpcode::PMAXW);
int info = eeRecompileCodeXMM( XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED );
if ( x86caps.hasStreamingSIMD4Extensions ) {
if( EEREC_S == EEREC_T ) xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if( EEREC_D == EEREC_S ) xPMAX.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if ( EEREC_D == EEREC_T ) xPMAX.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
@ -408,40 +399,6 @@ void recPMAXW()
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPMAX.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
}
}
else {
int t0reg;
if( EEREC_S == EEREC_T ) {
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
}
else {
t0reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
if( EEREC_D == EEREC_S ) {
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
}
else if( EEREC_D == EEREC_T ) {
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
_freeXMMreg(t1reg);
}
else {
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
}
xPOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
}
_clearNeededXMMregs();
}
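
The deleted else-branch emulated a signed 32-bit max out of PCMPGTD/PAND/PANDN/POR plus temporary-register juggling; with SSE4.1 guaranteed it is a single PMAXSD (and PMINSD in recPMINW below). A sketch comparing the two forms, with intrinsics standing in for the emitter calls:

#include <smmintrin.h>  // SSE4.1 (_mm_max_epi32)
#include <cstdio>

// What the removed SSE2 fallback computed: mask = s > t; d = (s & mask) | (t & ~mask)
static __m128i max_epi32_sse2(__m128i s, __m128i t)
{
    __m128i mask = _mm_cmpgt_epi32(s, t);
    return _mm_or_si128(_mm_and_si128(mask, s), _mm_andnot_si128(mask, t));
}

int main()
{
    __m128i s = _mm_setr_epi32(-5, 7, 100, -100);
    __m128i t = _mm_setr_epi32( 3, 2, -50,  200);

    __m128i a = _mm_max_epi32(s, t);  // SSE4.1: what xPMAX.SD emits now
    __m128i b = max_epi32_sse2(s, t); // the deleted multi-instruction equivalent

    int ra[4], rb[4];
    _mm_storeu_si128((__m128i*)ra, a);
    _mm_storeu_si128((__m128i*)rb, b);
    for (int i = 0; i < 4; i++)
        std::printf("%d %d\n", ra[i], rb[i]); // both columns: 3 7 100 200
    return 0;
}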
@ -1173,18 +1130,7 @@ void recPABSW() //needs clamping
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.D(xRegisterSSE(t0reg), 31);
xPCMP.EQD(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); //0xffffffff if equal to 0x80000000
if( x86caps.hasSupplementalStreamingSIMD3Extensions ) {
xPABS.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T)); //0x80000000 -> 0x80000000
}
else {
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPSRA.D(xRegisterSSE(t1reg), 31);
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xPSUB.D(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg)); //0x80000000 -> 0x80000000
_freeXMMreg(t1reg);
}
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg)); //0x80000000 -> 0x7fffffff
_freeXMMreg(t0reg);
_clearNeededXMMregs();
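
PABS.D is SSSE3, so the removed branch rebuilt |x| from SSE2 primitives with the usual sign-mask trick; the surrounding PCMPEQD/PXOR (still visible above) then saturates 0x80000000 to 0x7fffffff, as the EE's PABSW requires. Both absolute-value forms, sketched with intrinsics:

#include <tmmintrin.h>  // SSSE3 (_mm_abs_epi32)
#include <cstdio>

// Removed SSE2 fallback: sign = x >> 31 (arithmetic); |x| = (x ^ sign) - sign
static __m128i abs_epi32_sse2(__m128i x)
{
    __m128i sign = _mm_srai_epi32(x, 31);
    return _mm_sub_epi32(_mm_xor_si128(x, sign), sign);
}

int main()
{
    __m128i v = _mm_setr_epi32(-7, 7, 0, (int)0x80000000u);

    __m128i a = _mm_abs_epi32(v);  // SSSE3 path kept by the commit
    __m128i b = abs_epi32_sse2(v); // what the deleted branch emitted

    int ra[4], rb[4];
    _mm_storeu_si128((__m128i*)ra, a);
    _mm_storeu_si128((__m128i*)rb, b);
    // Both leave 0x80000000 unchanged; the recompiler's PCMPEQD/PXOR step above
    // is what then saturates that lane to 0x7fffffff.
    for (int i = 0; i < 4; i++)
        std::printf("%d %d\n", ra[i], rb[i]);
    return 0;
}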
@ -1203,18 +1149,7 @@ void recPABSH()
xPCMP.EQW(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xPSLL.W(xRegisterSSE(t0reg), 15);
xPCMP.EQW(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T)); //0xffff if equal to 0x8000
if( x86caps.hasSupplementalStreamingSIMD3Extensions ) {
xPABS.W(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T)); //0x8000 -> 0x8000
}
else {
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
xPSRA.W(xRegisterSSE(t1reg), 15);
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xPSUB.W(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg)); //0x8000 -> 0x8000
_freeXMMreg(t1reg);
}
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg)); //0x8000 -> 0x7fff
_freeXMMreg(t0reg);
_clearNeededXMMregs();
@ -1228,7 +1163,6 @@ void recPMINW()
EE::Profiler.EmitOp(eeOpcode::PMINW);
int info = eeRecompileCodeXMM( XMMINFO_READS|XMMINFO_READT|XMMINFO_WRITED );
if ( x86caps.hasStreamingSIMD4Extensions ) {
if( EEREC_S == EEREC_T ) xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else if( EEREC_D == EEREC_S ) xPMIN.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else if ( EEREC_D == EEREC_T ) xPMIN.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
@ -1236,40 +1170,6 @@ void recPMINW()
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPMIN.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
}
}
else {
int t0reg;
if( EEREC_S == EEREC_T ) {
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
}
else {
t0reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
xPCMP.GTD(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
if( EEREC_D == EEREC_S ) {
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
}
else if( EEREC_D == EEREC_T ) {
int t1reg = _allocTempXMMreg(XMMT_INT, -1);
xMOVDQA(xRegisterSSE(t1reg), xRegisterSSE(EEREC_T));
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
_freeXMMreg(t1reg);
}
else {
xMOVDQA(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
xPAND(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
xPANDN(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
}
xPOR(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
}
_clearNeededXMMregs();
}
@ -1718,12 +1618,6 @@ void recPMADDW()
{
EE::Profiler.EmitOp(eeOpcode::PMADDW);
if( !x86caps.hasStreamingSIMD4Extensions ) {
_deleteEEreg(_Rd_, 0);
recCall(Interp::PMADDW);
return;
}
int info = eeRecompileCodeXMM( (((_Rs_)&&(_Rt_))?XMMINFO_READS:0)|(((_Rs_)&&(_Rt_))?XMMINFO_READT:0)|(_Rd_?XMMINFO_WRITED:0)|XMMINFO_WRITELO|XMMINFO_WRITEHI|XMMINFO_READLO|XMMINFO_READHI );
xSHUF.PS(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_HI), 0x88);
xPSHUF.D(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_LO), 0xd8); // LO = {LO[0], HI[0], LO[2], HI[2]}
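
The removed early-out sent PMADDW (and PMSUBW/PMULTW below) to the interpreter on non-SSE4 hosts, presumably because the recompiled sequence relies on PMULDQ, the signed 32x32 to 64-bit multiply that only exists from SSE4.1; SSE2 offers only the unsigned PMULUDQ. A sketch of the difference (the reason is inferred, as the multiply itself is outside this hunk):

#include <smmintrin.h>  // SSE4.1 (_mm_mul_epi32)
#include <cstdio>
#include <cstdint>

int main()
{
    // PMULDQ/PMULUDQ only multiply lanes 0 and 2 of each source.
    __m128i a = _mm_setr_epi32(-3, 0, 100000, 0);
    __m128i b = _mm_setr_epi32( 7, 0, -50000, 0);

    __m128i s = _mm_mul_epi32(a, b); // SSE4.1: signed 32x32 -> 64
    __m128i u = _mm_mul_epu32(a, b); // SSE2: unsigned only, wrong for negative inputs

    int64_t rs[2], ru[2];
    _mm_storeu_si128((__m128i*)rs, s);
    _mm_storeu_si128((__m128i*)ru, u);
    std::printf("signed:   %lld %lld\n", (long long)rs[0], (long long)rs[1]); // -21 -5000000000
    std::printf("unsigned: %lld %lld\n", (long long)ru[0], (long long)ru[1]);
    return 0;
}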
@ -1775,19 +1669,9 @@ void recPSLLVW()
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
if ( x86caps.hasStreamingSIMD4Extensions ) {
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xPMOVSX.DQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_D));
xPSRA.D(xRegisterSSE(t0reg), 31);
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
}
}
else if( _Rt_ == 0 ) {
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
@ -1813,16 +1697,8 @@ void recPSLLVW()
xPSLL.D(xRegisterSSE(t1reg), xRegisterSSE(t0reg));
// merge & sign extend
if ( x86caps.hasStreamingSIMD4Extensions ) {
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xPMOVSX.DQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_D));
xPSRA.D(xRegisterSSE(t0reg), 31); // get the signs
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
}
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
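
The variable-shift hunks here and in recPSRLVW/recPSRAVW below all drop the same fallback: sign-extending two packed 32-bit results to 64 bits. PMOVSXDQ does in one instruction what the SSE2 path built from an arithmetic shift plus an unpack. Sketch:

#include <smmintrin.h>  // SSE4.1 (_mm_cvtepi32_epi64)
#include <cstdio>
#include <cstdint>

// Removed SSE2 fallback: copy the sign into a second register with PSRAD 31,
// then interleave low dwords so each value is paired with its sign half.
static __m128i sx32to64_sse2(__m128i v)
{
    __m128i sign = _mm_srai_epi32(v, 31);
    return _mm_unpacklo_epi32(v, sign);
}

int main()
{
    __m128i v = _mm_setr_epi32(-2, 123456, 0, 0); // only the two low lanes matter

    __m128i a = _mm_cvtepi32_epi64(v); // kept path: PMOVSXDQ
    __m128i b = sx32to64_sse2(v);      // deleted three-instruction version

    int64_t ra[2], rb[2];
    _mm_storeu_si128((__m128i*)ra, a);
    _mm_storeu_si128((__m128i*)rb, b);
    std::printf("%lld %lld | %lld %lld\n",
                (long long)ra[0], (long long)ra[1],
                (long long)rb[0], (long long)rb[1]); // -2 123456 | -2 123456
    return 0;
}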
@ -1843,19 +1719,9 @@ void recPSRLVW()
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
if ( x86caps.hasStreamingSIMD4Extensions ) {
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xPMOVSX.DQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_D));
xPSRA.D(xRegisterSSE(t0reg), 31);
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
}
}
else if( _Rt_ == 0 ) {
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
@ -1881,16 +1747,8 @@ void recPSRLVW()
xPSRL.D(xRegisterSSE(t1reg), xRegisterSSE(t0reg));
// merge & sign extend
if ( x86caps.hasStreamingSIMD4Extensions ) {
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xPMOVSX.DQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t1reg));
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_D));
xPSRA.D(xRegisterSSE(t0reg), 31); // get the signs
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
}
_freeXMMreg(t0reg);
_freeXMMreg(t1reg);
@ -1903,11 +1761,6 @@ void recPMSUBW()
{
EE::Profiler.EmitOp(eeOpcode::PMSUBW);
if( !x86caps.hasStreamingSIMD4Extensions ) {
_deleteEEreg(_Rd_, 0);
recCall(Interp::PMSUBW);
return;
}
int info = eeRecompileCodeXMM( (((_Rs_)&&(_Rt_))?XMMINFO_READS:0)|(((_Rs_)&&(_Rt_))?XMMINFO_READT:0)|(_Rd_?XMMINFO_WRITED:0)|XMMINFO_WRITELO|XMMINFO_WRITEHI|XMMINFO_READLO|XMMINFO_READHI );
xSHUF.PS(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_HI), 0x88);
xPSHUF.D(xRegisterSSE(EEREC_LO), xRegisterSSE(EEREC_LO), 0xd8); // LO = {LO[0], HI[0], LO[2], HI[2]}
@ -1957,11 +1810,6 @@ void recPMULTW()
{
EE::Profiler.EmitOp(eeOpcode::PMULTW);
if( !x86caps.hasStreamingSIMD4Extensions ) {
_deleteEEreg(_Rd_, 0);
recCall(Interp::PMULTW);
return;
}
int info = eeRecompileCodeXMM( (((_Rs_)&&(_Rt_))?XMMINFO_READS:0)|(((_Rs_)&&(_Rt_))?XMMINFO_READT:0)|(_Rd_?XMMINFO_WRITED:0)|XMMINFO_WRITELO|XMMINFO_WRITEHI );
if( !_Rs_ || !_Rt_ ) {
if( _Rd_ ) xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
@ -2455,19 +2303,9 @@ void recPSRAVW()
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
if ( x86caps.hasStreamingSIMD4Extensions ) {
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xPMOVSX.DQ(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
}
else {
int t0reg = _allocTempXMMreg(XMMT_INT, -1);
xPSHUF.D(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T), 0x88);
xMOVDQA(xRegisterSSE(t0reg), xRegisterSSE(EEREC_D));
xPSRA.D(xRegisterSSE(t0reg), 31);
xPUNPCK.LDQ(xRegisterSSE(EEREC_D), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
}
}
}
else if( _Rt_ == 0 ) {
xPXOR(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));

View File

@ -508,8 +508,8 @@ static void recReserve()
{
// Hardware Requirements Check...
if ( !x86caps.hasStreamingSIMD2Extensions )
recThrowHardwareDeficiency( L"SSE2" );
if ( !x86caps.hasStreamingSIMD4Extensions )
recThrowHardwareDeficiency( L"SSE4" );
recReserveCache();
}

View File

@ -47,7 +47,7 @@ void mVUreserveCache(microVU& mVU) {
// Only run this once per VU! ;)
void mVUinit(microVU& mVU, uint vuIndex) {
if(!x86caps.hasStreamingSIMD2Extensions) mVUthrowHardwareDeficiency( L"SSE2", vuIndex );
if(!x86caps.hasStreamingSIMD4Extensions) mVUthrowHardwareDeficiency( L"SSE4", vuIndex );
memzero(mVU.prog);

View File

@ -166,13 +166,7 @@ __fi void getQreg(const xmm& reg, int qInstance)
__ri void writeQreg(const xmm& reg, int qInstance)
{
if (qInstance) {
if (!x86caps.hasStreamingSIMD4Extensions) {
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
xMOVSS(xmmPQ, reg);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
}
else xINSERTPS(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
}
if (qInstance)
xINSERTPS(xmmPQ, reg, _MM_MK_INSERTPS_NDX(0, 1, 0));
else xMOVSS(xmmPQ, reg);
}
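
With SSE4.1 mandatory, writing the new Q into the second slot of xmmPQ is a single INSERTPS instead of the removed PSHUFD/MOVSS/PSHUFD round trip. Roughly, in intrinsic form:

#include <smmintrin.h>  // SSE4.1 (_mm_insert_ps)
#include <cstdio>

int main()
{
    __m128 xmmPQ = _mm_setr_ps(1.f, 2.f, 3.f, 4.f); // stand-in for the P/Q register
    __m128 q     = _mm_set_ss(9.f);                 // freshly computed Q in lane 0

    // _MM_MK_INSERTPS_NDX(0, 1, 0): take lane 0 of q, write it into lane 1 of
    // xmmPQ, zero nothing. This is what replaces the old shuffle/MOVSS/shuffle.
    xmmPQ = _mm_insert_ps(xmmPQ, q, _MM_MK_INSERTPS_NDX(0, 1, 0));

    float out[4];
    _mm_storeu_ps(out, xmmPQ);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 9 3 4
    return 0;
}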

View File

@ -56,34 +56,11 @@ void mVUclamp1(const xmm& reg, const xmm& regT1, int xyzw, bool bClampE = 0) {
// so we just use a temporary mem location for our backup for now... (non-sse4 version only)
void mVUclamp2(microVU& mVU, const xmm& reg, const xmm& regT1in, int xyzw, bool bClampE = 0) {
if ((!clampE && CHECK_VU_SIGN_OVERFLOW) || (clampE && bClampE && CHECK_VU_SIGN_OVERFLOW)) {
if (x86caps.hasStreamingSIMD4Extensions) {
int i = (xyzw==1||xyzw==2||xyzw==4||xyzw==8) ? 0: 1;
xPMIN.SD(reg, ptr128[&sse4_maxvals[i][0]]);
xPMIN.UD(reg, ptr128[&sse4_minvals[i][0]]);
return;
}
//const xmm& regT1 = regT1b ? mVU.regAlloc->allocReg() : regT1in;
const xmm& regT1 = regT1in.IsEmpty() ? xmm((reg.Id + 1) % 8) : regT1in;
if (regT1 != regT1in) xMOVAPS(ptr128[mVU.xmmCTemp], regT1);
switch (xyzw) {
case 1: case 2: case 4: case 8:
xMOVAPS(regT1, reg);
xAND.PS(regT1, ptr128[mVUglob.signbit]);
xMIN.SS(reg, ptr128[mVUglob.maxvals]);
xMAX.SS(reg, ptr128[mVUglob.minvals]);
xOR.PS (reg, regT1);
break;
default:
xMOVAPS(regT1, reg);
xAND.PS(regT1, ptr128[mVUglob.signbit]);
xMIN.PS(reg, ptr128[mVUglob.maxvals]);
xMAX.PS(reg, ptr128[mVUglob.minvals]);
xOR.PS (reg, regT1);
break;
}
//if (regT1 != regT1in) mVU.regAlloc->clearNeeded(regT1);
if (regT1 != regT1in) xMOVAPS(regT1, ptr128[mVU.xmmCTemp]);
}
else mVUclamp1(reg, regT1in, xyzw, bClampE);
}
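
The surviving clamp treats the float bit patterns as integers: PMINSD against a table of maximum values pulls positive infinities/NaNs down to the largest normal float, and PMINUD against a table of minimum values does the same on the negative side, replacing the removed MIN/MAX.PS plus sign-bit save/restore. The tables themselves are not shown in this diff, so the 0x7f7fffff / 0xff7fffff constants below are an assumption, and the scalar-vs-vector table selection (the int i line) is omitted:

#include <smmintrin.h>  // SSE4.1 (_mm_min_epi32 / _mm_min_epu32)
#include <cmath>
#include <cstdio>

// Assumed stand-ins for sse4_maxvals[i] / sse4_minvals[i] (full-vector case).
static const int MAXVAL = 0x7f7fffff;        //  FLT_MAX bit pattern
static const int MINVAL = (int)0xff7fffffu;  // -FLT_MAX bit pattern

static __m128 vu_clamp_sse4(__m128 reg)
{
    __m128i bits = _mm_castps_si128(reg);
    bits = _mm_min_epi32(bits, _mm_set1_epi32(MAXVAL)); // +Inf/+NaN -> +FLT_MAX
    bits = _mm_min_epu32(bits, _mm_set1_epi32(MINVAL)); // -Inf/-NaN -> -FLT_MAX
    return _mm_castsi128_ps(bits);
}

int main()
{
    __m128 v = _mm_setr_ps(1.5f, INFINITY, -INFINITY, -2.25f);
    float out[4];
    _mm_storeu_ps(out, vu_clamp_sse4(v));
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    // 1.5 3.40282e+38 -3.40282e+38 -2.25
    return 0;
}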

View File

@ -28,11 +28,7 @@ static __fi void testZero(const xmm& xmmReg, const xmm& xmmTemp, const x32& gprT
{
xXOR.PS(xmmTemp, xmmTemp);
xCMPEQ.SS(xmmTemp, xmmReg);
if (!x86caps.hasStreamingSIMD4Extensions) {
xMOVMSKPS(gprTemp, xmmTemp);
xTEST(gprTemp, 1);
}
else xPTEST(xmmTemp, xmmTemp);
xPTEST(xmmTemp, xmmTemp);
}
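
PTEST lets the zero test set ZF directly from the XMM compare result, so the MOVMSKPS-into-a-GPR detour disappears. Both variants in intrinsic form (the recompiler branches on the flags rather than materialising a bool):

#include <smmintrin.h>  // SSE4.1 (_mm_testz_si128)
#include <cstdio>

int main()
{
    __m128 xmmReg  = _mm_set_ss(0.0f);                       // value under test, lane 0
    __m128 xmmTemp = _mm_cmpeq_ss(_mm_setzero_ps(), xmmReg); // all-ones in lane 0 if zero

    __m128i t = _mm_castps_si128(xmmTemp);

    // Kept path: PTEST sets ZF when (t & t) == 0, i.e. when the compare found no match.
    int reg_was_nonzero      = _mm_testz_si128(t, t);
    // Removed path: pull the sign bits into a GPR and TEST bit 0.
    int reg_was_nonzero_sse2 = !(_mm_movemask_ps(xmmTemp) & 1);

    std::printf("%d %d\n", reg_was_nonzero, reg_was_nonzero_sse2); // 0 0 (reg is zero)
    return 0;
}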
// Test if Vector is Negative (Set Flags and Makes Positive)
@ -298,18 +294,8 @@ mVUop(mVU_EEXP) {
// sumXYZ(): PQ.x = x ^ 2 + y ^ 2 + z ^ 2
static __fi void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) {
if (x86caps.hasStreamingSIMD4Extensions) {
xDP.PS(Fs, Fs, 0x71);
xMOVSS(PQ, Fs);
}
else {
SSE_MULPS(mVU, Fs, Fs); // wzyx ^ 2
xMOVSS (PQ, Fs); // x ^ 2
xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2
xPSHUF.D (Fs, Fs, 0xd2); // wzxy -> wxyz
SSE_ADDSS(mVU, PQ, Fs); // x ^ 2 + y ^ 2 + z ^ 2
}
}
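
mVU_sumXYZ now always uses DPPS: immediate 0x71 multiplies only the x, y and z lanes and writes the sum to lane 0, which is exactly the x^2 + y^2 + z^2 that the removed MULPS/shuffle/ADDSS chain accumulated. Sketch:

#include <smmintrin.h>  // SSE4.1 (_mm_dp_ps)
#include <cstdio>

int main()
{
    __m128 Fs = _mm_setr_ps(1.f, 2.f, 3.f, 100.f); // the w lane must not contribute

    // 0x71: high nibble 0x7 multiplies lanes 0..2 only (w excluded),
    //       low nibble 0x1 stores the sum in lane 0 and zeroes the rest.
    __m128 dot = _mm_dp_ps(Fs, Fs, 0x71);

    float out[4];
    _mm_storeu_ps(out, dot);
    std::printf("%g\n", out[0]); // 14 (1 + 4 + 9)
    return 0;
}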
mVUop(mVU_ELENG) {

View File

@ -59,72 +59,29 @@ void mVUsaveReg(const xmm& reg, xAddressVoid ptr, int xyzw, bool modXYZW)
return;*/
switch ( xyzw ) {
case 5: if (x86caps.hasStreamingSIMD4Extensions) {
xEXTRACTPS(ptr32[ptr+4], reg, 1);
case 5: xEXTRACTPS(ptr32[ptr+4], reg, 1);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
xPSHUF.D(reg, reg, 0xe1); //WZXY
xMOVSS(ptr32[ptr+4], reg);
xPSHUF.D(reg, reg, 0xff); //WWWW
xMOVSS(ptr32[ptr+12], reg);
}
break; // YW
case 6: xPSHUF.D(reg, reg, 0xc9);
xMOVL.PS(ptr64[ptr+4], reg);
break; // YZ
case 7: if (x86caps.hasStreamingSIMD4Extensions) {
xMOVH.PS(ptr64[ptr+8], reg);
case 7: xMOVH.PS(ptr64[ptr+8], reg);
xEXTRACTPS(ptr32[ptr+4], reg, 1);
}
else {
xPSHUF.D(reg, reg, 0x93); //ZYXW
xMOVH.PS(ptr64[ptr+4], reg);
xMOVSS(ptr32[ptr+12], reg);
}
break; // YZW
case 9: if (x86caps.hasStreamingSIMD4Extensions) {
xMOVSS(ptr32[ptr], reg);
case 9: xMOVSS(ptr32[ptr], reg);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
xMOVSS(ptr32[ptr], reg);
xPSHUF.D(reg, reg, 0xff); //WWWW
xMOVSS(ptr32[ptr+12], reg);
}
break; // XW
case 10: if (x86caps.hasStreamingSIMD4Extensions) {
xMOVSS(ptr32[ptr], reg);
case 10: xMOVSS(ptr32[ptr], reg);
xEXTRACTPS(ptr32[ptr+8], reg, 2);
}
else {
xMOVSS(ptr32[ptr], reg);
xMOVHL.PS(reg, reg);
xMOVSS(ptr32[ptr+8], reg);
}
break; //XZ
case 11: xMOVSS(ptr32[ptr], reg);
xMOVH.PS(ptr64[ptr+8], reg);
break; //XZW
case 13: if (x86caps.hasStreamingSIMD4Extensions) {
xMOVL.PS(ptr64[ptr], reg);
case 13: xMOVL.PS(ptr64[ptr], reg);
xEXTRACTPS(ptr32[ptr+12], reg, 3);
}
else {
xPSHUF.D(reg, reg, 0x4b); //YXZW
xMOVH.PS(ptr64[ptr], reg);
xMOVSS(ptr32[ptr+12], reg);
}
break; // XYW
case 14: if (x86caps.hasStreamingSIMD4Extensions) {
xMOVL.PS(ptr64[ptr], reg);
case 14: xMOVL.PS(ptr64[ptr], reg);
xEXTRACTPS(ptr32[ptr+8], reg, 2);
}
else {
xMOVL.PS(ptr64[ptr], reg);
xMOVHL.PS(reg, reg);
xMOVSS(ptr32[ptr+8], reg);
}
break; // XYZ
case 4: if (!modXYZW) mVUunpack_xyzw(reg, reg, 1);
xMOVSS(ptr32[ptr+4], reg);
@ -146,8 +103,14 @@ void mVUsaveReg(const xmm& reg, xAddressVoid ptr, int xyzw, bool modXYZW)
void mVUmergeRegs(const xmm& dest, const xmm& src, int xyzw, bool modXYZW)
{
xyzw &= 0xf;
if ( (dest != src) && (xyzw != 0) ) {
if (x86caps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) {
if ( (dest != src) && (xyzw != 0) )
{
if (xyzw == 0x8)
xMOVSS(dest, src);
else if (xyzw == 0xf)
xMOVAPS(dest, src);
else
{
if (modXYZW) {
if (xyzw == 1) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; }
else if (xyzw == 2) { xINSERTPS(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; }
@ -156,56 +119,6 @@ void mVUmergeRegs(const xmm& dest, const xmm& src, int xyzw, bool modXYZW)
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
xBLEND.PS(dest, src, xyzw);
}
else {
switch (xyzw) {
case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0);
xMOVHL.PS(src, dest); // src = Sw Sz Dw Dz
xSHUF.PS(dest, src, 0xc4); // 11 00 01 00
break;
case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0);
xMOVHL.PS(src, dest);
xSHUF.PS(dest, src, 0x64);
break;
case 3: xSHUF.PS(dest, src, 0xe4);
break;
case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0);
xMOVSS(src, dest);
xMOVSD(dest, src);
break;
case 5: xSHUF.PS(dest, src, 0xd8);
xPSHUF.D(dest, dest, 0xd8);
break;
case 6: xSHUF.PS(dest, src, 0x9c);
xPSHUF.D(dest, dest, 0x78);
break;
case 7: xMOVSS(src, dest);
xMOVAPS(dest, src);
break;
case 8: xMOVSS(dest, src);
break;
case 9: xSHUF.PS(dest, src, 0xc9);
xPSHUF.D(dest, dest, 0xd2);
break;
case 10: xSHUF.PS(dest, src, 0x8d);
xPSHUF.D(dest, dest, 0x72);
break;
case 11: xMOVSS(dest, src);
xSHUF.PS(dest, src, 0xe4);
break;
case 12: xMOVSD(dest, src);
break;
case 13: xMOVHL.PS(dest, src);
xSHUF.PS(src, dest, 0x64);
xMOVAPS(dest, src);
break;
case 14: xMOVHL.PS(dest, src);
xSHUF.PS(src, dest, 0xc4);
xMOVAPS(dest, src);
break;
default: xMOVAPS(dest, src);
break;
}
}
}
}
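
In the kept generic case the VU-style write mask (bit 3 = x ... bit 0 = w) is converted to BLENDPS lane order (bit 0 = x ... bit 3 = w) by reversing its four bits, after which one BLENDPS replaces the entire removed SHUFPS/PSHUFD switch. A sketch of that remap (for example, remap(0x8), x only, becomes 0x1):

#include <smmintrin.h>  // SSE4.1 (_mm_blend_ps)
#include <cstdio>

// Mirror of the kept remap: reverse the 4-bit mask so x (bit 3 in VU order)
// lands on BLENDPS bit 0, y on bit 1, and so on.
static int remap_xyzw(int xyzw)
{
    return ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
}

int main()
{
    __m128 dest = _mm_setr_ps(0.f, 0.f, 0.f, 0.f);
    __m128 src  = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);

    // VU mask 0x6 selects y and z; it happens to remap to BLENDPS immediate 0x6
    // as well (the bit reversal is its own inverse on palindromic masks). BLENDPS
    // needs a compile-time immediate, so the remapped value is written by hand.
    __m128 r = _mm_blend_ps(dest, src, 0x6);

    float out[4];
    _mm_storeu_ps(out, r);
    std::printf("remap(0x6)=0x%x -> %g %g %g %g\n",
                remap_xyzw(0x6), out[0], out[1], out[2], out[3]); // 0x6 -> 0 2 3 0
    return 0;
}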

View File

@ -35,23 +35,7 @@ static RecompiledCodeReserve* nVifUpkExec = NULL;
// Merges xmm vectors without modifying source reg
void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|| (xyzw==12) || (xyzw==11) || (xyzw==8) || (xyzw==3)) {
mVUmergeRegs(dest, src, xyzw);
}
else
{
if(temp != src) xMOVAPS(temp, src); //Sometimes we don't care if the source is modified and is temp reg.
if(dest == temp)
{
//VIF can sent the temp directory as the source and destination, just need to clear the ones we dont want in which case.
if(!(xyzw & 0x1)) xAND.PS( dest, ptr128[SSEXYZWMask[0]]);
if(!(xyzw & 0x2)) xAND.PS( dest, ptr128[SSEXYZWMask[1]]);
if(!(xyzw & 0x4)) xAND.PS( dest, ptr128[SSEXYZWMask[2]]);
if(!(xyzw & 0x8)) xAND.PS( dest, ptr128[SSEXYZWMask[3]]);
}
else mVUmergeRegs(dest, temp, xyzw);
}
}
// =====================================================================================================
@ -113,16 +97,6 @@ void VifUnpackSSE_Base::xUPK_S_32() const {
void VifUnpackSSE_Base::xUPK_S_16() const {
if (!x86caps.hasStreamingSIMD4Extensions)
{
xMOV16 (workReg, ptr32[srcIndirect]);
xPUNPCK.LWD(workReg, workReg);
xShiftR (workReg, 16);
xPSHUF.D (destReg, workReg, _v0);
return;
}
switch(UnpkLoopIteration)
{
case 0:
@ -144,17 +118,6 @@ void VifUnpackSSE_Base::xUPK_S_16() const {
void VifUnpackSSE_Base::xUPK_S_8() const {
if (!x86caps.hasStreamingSIMD4Extensions)
{
xMOV8 (workReg, ptr32[srcIndirect]);
xPUNPCK.LBW(workReg, workReg);
xPUNPCK.LWD(workReg, workReg);
xShiftR (workReg, 24);
xPSHUF.D (destReg, workReg, _v0);
return;
}
switch(UnpkLoopIteration)
{
case 0:
@ -199,18 +162,8 @@ void VifUnpackSSE_Base::xUPK_V2_32() const {
void VifUnpackSSE_Base::xUPK_V2_16() const {
if(UnpkLoopIteration == 0)
{
if (x86caps.hasStreamingSIMD4Extensions)
{
xPMOVXX16 (workReg);
}
else
{
xMOV64 (workReg, ptr64[srcIndirect]);
xPUNPCK.LWD(workReg, workReg);
xShiftR (workReg, 16);
}
xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0
}
else
@ -223,19 +176,9 @@ void VifUnpackSSE_Base::xUPK_V2_16() const {
void VifUnpackSSE_Base::xUPK_V2_8() const {
if(UnpkLoopIteration == 0 || !x86caps.hasStreamingSIMD4Extensions)
{
if (x86caps.hasStreamingSIMD4Extensions)
if(UnpkLoopIteration == 0)
{
xPMOVXX8 (workReg);
}
else
{
xMOV16 (workReg, ptr32[srcIndirect]);
xPUNPCK.LBW(workReg, workReg);
xPUNPCK.LWD(workReg, workReg);
xShiftR (workReg, 24);
}
xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0
}
else
@ -254,16 +197,7 @@ void VifUnpackSSE_Base::xUPK_V3_32() const {
void VifUnpackSSE_Base::xUPK_V3_16() const {
if (x86caps.hasStreamingSIMD4Extensions)
{
xPMOVXX16 (destReg);
}
else
{
xMOV64 (destReg, ptr32[srcIndirect]);
xPUNPCK.LWD(destReg, destReg);
xShiftR (destReg, 16);
}
//With V3-16, it takes the first vector from the next position as the W vector
//However - IF the end of this iteration of the unpack falls on a quadword boundary, W becomes 0
@ -278,17 +212,7 @@ void VifUnpackSSE_Base::xUPK_V3_16() const {
void VifUnpackSSE_Base::xUPK_V3_8() const {
if (x86caps.hasStreamingSIMD4Extensions)
{
xPMOVXX8 (destReg);
}
else
{
xMOV32 (destReg, ptr32[srcIndirect]);
xPUNPCK.LBW(destReg, destReg);
xPUNPCK.LWD(destReg, destReg);
xShiftR (destReg, 24);
}
if (UnpkLoopIteration != IsAligned)
xAND.PS(destReg, ptr128[SSEXYZWMask[0]]);
}
@ -300,31 +224,12 @@ void VifUnpackSSE_Base::xUPK_V4_32() const {
void VifUnpackSSE_Base::xUPK_V4_16() const {
if (x86caps.hasStreamingSIMD4Extensions)
{
xPMOVXX16 (destReg);
}
else
{
xMOV64 (destReg, ptr32[srcIndirect]);
xPUNPCK.LWD(destReg, destReg);
xShiftR (destReg, 16);
}
}
void VifUnpackSSE_Base::xUPK_V4_8() const {
if (x86caps.hasStreamingSIMD4Extensions)
{
xPMOVXX8 (destReg);
}
else
{
xMOV32 (destReg, ptr32[srcIndirect]);
xPUNPCK.LBW(destReg, destReg);
xPUNPCK.LWD(destReg, destReg);
xShiftR (destReg, 24);
}
}
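
All of the VIF unpack helpers in this file follow the same pattern: the xPMOVXX8/xPMOVXX16 macros expand to a single PMOVSX/PMOVZX load (signed or unsigned depending on the unpack's USN mode), replacing the removed MOV + PUNPCK + shift sequences. A sketch of the V4-8 case, assuming the unsigned variant:

#include <smmintrin.h>  // SSE4.1 (_mm_cvtepu8_epi32)
#include <cstdio>
#include <cstdint>
#include <cstring>

int main()
{
    const uint8_t src[4] = { 1, 2, 0x80, 0xff }; // one V4-8 vector in guest memory

    int32_t bits;
    std::memcpy(&bits, src, sizeof(bits));
    __m128i packed = _mm_cvtsi32_si128(bits);    // 32-bit load into an XMM register

    // Kept path (unsigned case): PMOVZXBD widens four bytes to four dwords at once.
    __m128i a = _mm_cvtepu8_epi32(packed);

    // Removed SSE2 path: PUNPCKLBW/PUNPCKLWD against itself, then a right shift
    // by 24 (logical here; arithmetic for the signed unpack modes).
    __m128i w = packed;
    w = _mm_unpacklo_epi8(w, w);
    w = _mm_unpacklo_epi16(w, w);
    __m128i b = _mm_srli_epi32(w, 24);

    int ra[4], rb[4];
    _mm_storeu_si128((__m128i*)ra, a);
    _mm_storeu_si128((__m128i*)rb, b);
    for (int i = 0; i < 4; i++)
        std::printf("%d %d\n", ra[i], rb[i]); // 1 1 / 2 2 / 128 128 / 255 255
    return 0;
}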
void VifUnpackSSE_Base::xUPK_V4_5() const {