diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 374e9603f5..7693236a62 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1340,41 +1340,56 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) } -void XEmitter::WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) { - if (size == 64 && packed) - Write8(0x66); //this time, override goes upwards - if (!packed) - Write8(size == 64 ? 0xF2 : 0xF3); + if (opPrefix) + Write8(opPrefix); arg.operandReg = regOp; arg.WriteRex(this, 0, 0); Write8(0x0F); - if (sseOp > 0xFF) - Write8((sseOp >> 8) & 0xFF); - Write8(sseOp & 0xFF); + if (op > 0xFF) + Write8((op >> 8) & 0xFF); + Write8(op & 0xFF); arg.WriteRest(this, extrabytes); } -void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) { - WriteAVXOp(size, sseOp, packed, regOp, X64Reg::INVALID_REG, arg, extrabytes); + WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); } -void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +static int GetVEXmmmmm(u16 op) +{ + // Currently, only 0x38 and 0x3A are used as secondary escape byte. + if ((op >> 8) == 0x3A) + return 3; + else if ((op >> 8) == 0x38) + return 2; + else + return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ + if (opPrefix == 0x66) + return 1; + else if (opPrefix == 0xF3) + return 2; + else if (opPrefix == 0xF2) + return 3; + else + return 0; +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) { if (!cpu_info.bAVX) PanicAlert("Trying to use AVX on a system that doesn't support it. 
Bad programmer."); - // Currently, only 0x38 and 0x3A are used as secondary escape byte. - int mmmmm; - if ((sseOp >> 8) == 0x3A) - mmmmm = 3; - else if ((sseOp >> 8) == 0x38) - mmmmm = 2; - else - mmmmm = 1; + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here - arg.WriteVex(this, regOp1, regOp2, 0, (packed << 1) | (size == 64), mmmmm); - Write8(sseOp & 0xFF); + arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm); + Write8(op & 0xFF); arg.WriteRest(this, extrabytes, regOp1); } @@ -1383,21 +1398,8 @@ void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg r { if (size != 32 && size != 64) PanicAlert("VEX GPR instructions only support 32-bit and 64-bit modes!"); - int mmmmm, pp; - if ((op >> 8) == 0x3A) - mmmmm = 3; - else if ((op >> 8) == 0x38) - mmmmm = 2; - else - mmmmm = 1; - if (opPrefix == 0x66) - pp = 1; - else if (opPrefix == 0xF3) - pp = 2; - else if (opPrefix == 0xF2) - pp = 3; - else - pp = 0; + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64); Write8(op & 0xFF); arg.WriteRest(this, extrabytes, regOp1); @@ -1419,8 +1421,8 @@ void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); } -void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);} -void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);} +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);} +void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);} void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) { @@ -1473,123 +1475,123 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext) void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} void XEmitter::LDMXCSR(OpArg 
memloc) {WriteMXCSR(memloc, 2);} -void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);} -void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);} -void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);} +void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} +void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} +void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} -void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, false, regOp, arg);} -void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, false, regOp, arg);} -void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, false, regOp, arg);} -void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, false, regOp, arg);} -void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);} -void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);} -void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, false, regOp, arg);} -void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, false, regOp, arg);} -void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, false, regOp, arg);} -void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, false, regOp, arg);} -void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, false, regOp, arg);} -void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, false, regOp, arg);} -void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, false, regOp, arg);} -void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, false, regOp, arg);} -void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) 
{WriteSSEOp(32, sseSQRT, false, regOp, arg);} -void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, false, regOp, arg);} -void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);} +void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);} +void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);} +void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);} +void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);} +void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);} +void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);} +void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);} +void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);} +void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);} +void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);} +void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);} +void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);} +void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} +void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} +void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} -void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, true, regOp, arg);} -void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, true, regOp, arg);} -void XEmitter::SUBPS(X64Reg 
regOp, OpArg arg) {WriteSSEOp(32, sseSUB, true, regOp, arg);} -void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, true, regOp, arg);} -void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);} -void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);} -void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseAND, true, regOp, arg);} -void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseAND, true, regOp, arg);} -void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseANDN, true, regOp, arg);} -void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseANDN, true, regOp, arg);} -void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseOR, true, regOp, arg);} -void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseOR, true, regOp, arg);} -void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseXOR, true, regOp, arg);} -void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseXOR, true, regOp, arg);} -void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, true, regOp, arg);} -void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, true, regOp, arg);} -void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, true, regOp, arg);} -void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, true, regOp, arg);} -void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, true, regOp, arg);} -void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, true, regOp, arg);} -void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, true, regOp, arg);} -void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, true, regOp, arg);} -void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, true, regOp, arg);} -void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) 
{WriteSSEOp(64, sseSQRT, true, regOp, arg);} -void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);} -void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);} -void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);} +void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);} +void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);} +void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);} +void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);} +void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);} +void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);} +void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);} +void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);} +void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);} +void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);} +void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);} +void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);} +void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);} +void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);} +void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);} +void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 
sseDIV, regOp, arg);} +void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);} +void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);} +void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);} +void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);} +void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);} +void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);} +void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} +void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} +void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} -void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed -void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered -void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered -void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);} +void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered +void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered +void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} -void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);} -void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);} -void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) 
{WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);} -void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);} +void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} +void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} -void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);} -void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);} -void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);} -void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);} +void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} -void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);} -void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);} -void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);} -void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);} +void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} +void 
XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} -void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVLPDfromRM, false, regOp, arg);} -void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVHPDfromRM, false, regOp, arg);} -void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVLPDtoRM, false, regOp, arg);} -void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVHPDtoRM, false, regOp, arg);} +void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVLPDfromRM, regOp, arg);} /* MOVLPD/MOVHPD take the 0x66 prefix; F2 0F 12 would encode MOVDDUP instead */ +void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVHPDfromRM, regOp, arg);} +void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVLPDtoRM, regOp, arg);} +void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVHPDtoRM, regOp, arg);} -void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVHLPS, true, regOp1, R(regOp2));} -void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVLHPS, true, regOp1, R(regOp2));} +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));} -void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);} -void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);} +void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} +void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} -void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);} -void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);} -void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2D, false, regOp, arg);} -void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) 
{WriteSSEOp(32, 0x2D, false, regOp, arg);} -void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2A, false, regOp, arg);} -void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2A, false, regOp, arg);} +void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} +void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} +void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} +void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} +void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);} +void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} -void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);} -void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);} -void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);} -void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5B, true, regOp, arg);} +void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} +void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} +void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} +void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} -void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2C, false, regOp, arg);} -void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2C, false, regOp, arg);} -void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, false, regOp, arg);} -void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, true, regOp, arg);} +void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} +void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) 
{WriteSSEOp(0xF3, 0x2C, regOp, arg);} +void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} +void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} -void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));} +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));} -void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);} -void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);} +void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);} +void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);} -void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only +void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only // THESE TWO ARE UNTESTED. 
-void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x14, true, dest, arg);} -void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x15, true, dest, arg);} +void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);} +void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);} -void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);} -void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);} +void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);} +void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);} void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) { if (cpu_info.bSSE3) { - WriteSSEOp(64, 0x12, false, regOp, arg); //SSE3 movddup + WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup } else { @@ -1603,53 +1605,52 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) //There are a few more left // Also some integer instructions are missing -void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);} -void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);} -void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);} +void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} +void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} +void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} -void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);} -void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);} -void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);} -//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);} +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg 
&arg) {WriteSSEOp(0x66, 0x60, dest, arg);} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} void XEmitter::PSRLW(X64Reg reg, int shift) { - WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg)); + WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); Write8(shift); } void XEmitter::PSRLD(X64Reg reg, int shift) { - WriteSSEOp(64, 0x72, true, (X64Reg)2, R(reg)); + WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); Write8(shift); } void XEmitter::PSRLQ(X64Reg reg, int shift) { - WriteSSEOp(64, 0x73, true, (X64Reg)2, R(reg)); + WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); Write8(shift); } void XEmitter::PSRLQ(X64Reg reg, OpArg arg) { - WriteSSEOp(64, 0xd3, true, reg, arg); + WriteSSEOp(0x66, 0xd3, reg, arg); } void XEmitter::PSLLW(X64Reg reg, int shift) { - WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg)); + WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); Write8(shift); } void XEmitter::PSLLD(X64Reg reg, int shift) { - WriteSSEOp(64, 0x72, true, (X64Reg)6, R(reg)); + WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); Write8(shift); } void XEmitter::PSLLQ(X64Reg reg, int shift) { - WriteSSEOp(64, 0x73, true, (X64Reg)6, R(reg)); + WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); Write8(shift); } @@ -1677,100 +1678,109 @@ void XEmitter::PSRAD(X64Reg reg, int shift) Write8(shift); } -void XEmitter::WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) { if (!cpu_info.bSSSE3) PanicAlert("Trying to use SSSE3 on a system that doesn't support it. 
Bad programmer."); - WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); } -void XEmitter::WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) { if (!cpu_info.bSSE4_1) PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer."); - WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); } -void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(64, 0x3800, true, dest, arg);} -void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3817, true, dest, arg);} -void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x382b, true, dest, arg);} +void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);} +void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);} +void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} -void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3820, true, dest, arg);} -void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3821, true, dest, arg);} -void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3822, true, dest, arg);} -void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3823, true, dest, arg);} -void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3824, true, dest, arg);} -void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3825, true, dest, arg);} -void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3830, true, dest, arg);} -void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3831, true, dest, arg);} -void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3832, true, dest, arg);} -void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) 
{WriteSSE41Op(64, 0x3833, true, dest, arg);} -void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3834, true, dest, arg);} -void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3835, true, dest, arg);} +void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} +void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} +void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} +void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} +void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} +void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} +void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} +void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} +void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} +void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} +void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} +void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} -void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3810, true, dest, arg);} -void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3814, true, dest, arg);} -void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3815, true, dest, arg);} +void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} +void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} +void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} -void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} -void XEmitter::PANDN(X64Reg dest, OpArg arg) 
{WriteSSEOp(64, 0xDF, true, dest, arg);} -void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEF, true, dest, arg);} -void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEB, true, dest, arg);} +void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);} +void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);} +void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);} +void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);} -void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFC, true, dest, arg);} -void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFD, true, dest, arg);} -void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFE, true, dest, arg);} -void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD4, true, dest, arg);} +void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);} +void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);} +void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);} +void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);} -void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEC, true, dest, arg);} -void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xED, true, dest, arg);} -void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDC, true, dest, arg);} -void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest, arg);} +void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);} +void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);} +void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);} +void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);} -void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);} 
-void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);} -void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);} -void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);} +void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);} +void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);} +void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);} +void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);} -void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);} -void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);} -void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD8, true, dest, arg);} -void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD9, true, dest, arg);} +void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);} +void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);} +void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);} +void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);} -void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE0, true, dest, arg);} -void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE3, true, dest, arg);} +void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);} +void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);} -void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x74, true, dest, arg);} -void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x75, true, dest, arg);} -void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x76, true, dest, arg);} +void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);} +void 
XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);} +void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);} -void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x64, true, dest, arg);} -void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x65, true, dest, arg);} -void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} +void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);} +void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);} +void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);} -void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg);} -void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg);} +void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg); Write8(subreg);} +void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg); Write8(subreg);} -void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF5, true, dest, arg); } -void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF6, true, dest, arg);} +void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } +void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);} -void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEE, true, dest, arg); } -void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDE, true, dest, arg); } -void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEA, true, dest, arg); } -void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDA, true, dest, arg); } +void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); } +void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, 
dest, arg); } +void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); } +void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); } -void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD7, true, dest, arg); } +void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD7, dest, arg); } -void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} // VEX -void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);} -void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);} -void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);} -void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);} -void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);} -void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);} -void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, 
arg);} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);} +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);} +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index a6e89d278e..9f41194edc 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -126,6 +126,11 @@ struct OpArg //if scale == 0 never mind offsetting offset = _offset; } + bool operator==(OpArg b) + { + return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + 
} void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const; void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const; void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const; @@ -273,11 +278,11 @@ private: void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext); void WriteMXCSR(OpArg arg, int ext); - void WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); @@ -725,9 +730,18 @@ public: void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VADDPD(X64Reg 
regOp1, X64Reg regOp2, OpArg arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); // VEX GPR instructions void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index de051b4bf9..3caf43f8f6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -728,6 +728,26 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc SetJumpTarget(noBreakpoint); } + // If we have an input register that is going to be used again, load it pre-emptively, + // even if the instruction doesn't strictly need it in a register, to avoid redundant + // loads later. Of course, don't do this if we're already out of registers. + // As a bit of a heuristic, make sure we have at least one register left over for the + // output, which needs to be bound in the actual instruction compilation. + // TODO: make this smarter in the case that we're actually register-starved, i.e. + // prioritize the more important registers. 
+ for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) + { + int reg = ops[i].regsIn[k]; + if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm()) + gpr.BindToRegister(reg, true, false); + } + for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) + { + int reg = ops[i].fregsIn[k]; + if (reg >= 0 && (ops[i].fprInXmm & (1 << reg))) + fpr.BindToRegister(reg, true, false); + } + Jit64Tables::CompileInstruction(ops[i]); // If we have a register that will never be used again, flush it. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index cb3ef5b0a9..4dfbe56eb8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -140,10 +140,13 @@ public: void MultiplyImmediate(u32 imm, int a, int d, bool overflow); - void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); - void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); - void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + bool Rc = false, bool carry = false); + void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); void 
FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 11eb9de2c7..622d0b535d 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -108,7 +108,22 @@ X64Reg RegCache::GetFreeXReg() return (X64Reg)xr; } } - //Okay, not found :( Force grab one + // Okay, not found :( Force grab one! + + // First, see if we have any registers that are only going to be used for a float store. + // These go through GPRs, so the cost of tossing them back into memory is lower than anything else. + for (size_t i = 0; i < aCount; i++) + { + X64Reg xr = (X64Reg)aOrder[i]; + if (xregs[xr].locked) + continue; + size_t preg = xregs[xr].ppcReg; + if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg))) + { + StoreFromRegister(preg); + return xr; + } + } //TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions u32 last_used = 0xFFFFFFFF; @@ -366,3 +381,14 @@ void RegCache::Flush(FlushMode mode) cur_use_quantum = 0; } + +int RegCache::NumFreeRegisters() +{ + int count = 0; + size_t aCount; + const int* aOrder = GetAllocationOrder(aCount); + for (size_t i = 0; i < aCount; i++) + if (!xregs[aOrder[i]].locked && xregs[aOrder[i]].free) + count++; + return count; +} diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 7a79086f54..0fe3e9fe5f 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -123,6 +123,7 @@ public: Gen::X64Reg GetFreeXReg(); + int NumFreeRegisters(); }; class GPRRegCache : public RegCache diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index fe321d4e63..e7086b59b9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -14,65 +14,27 @@ static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000 static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; -void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS) +void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), + void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); + fpr.BindToRegister(d, d == a || d == b || !single); if (roundRHS) { if (d == a) { - fpr.BindToRegister(d, true); - MOVSD(XMM0, fpr.R(b)); - Force25BitPrecision(XMM0, XMM1); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(XMM0, fpr.R(b), XMM1); + (this->*sseOp)(fpr.RX(d), R(XMM0)); } else { - fpr.BindToRegister(d, d == b); - if (d != b) - MOVSD(fpr.RX(d), fpr.R(b)); - Force25BitPrecision(fpr.RX(d), XMM0); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - } - else if (d == a) - { - fpr.BindToRegister(d, true); - if (!single) - { - fpr.BindToRegister(b, true, false); - } - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (d == b) - { - if (reversible) - { - fpr.BindToRegister(d, true); - if (!single) - { - fpr.BindToRegister(a, true, false); - } - (this->*op)(fpr.RX(d), fpr.R(a)); - } - else - { - MOVSD(XMM0, fpr.R(b)); - fpr.BindToRegister(d, !single); - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), Gen::R(XMM0)); + Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); + (this->*sseOp)(fpr.RX(d), fpr.R(a)); } } else { - // Sources different from d, can use rather quick solution - fpr.BindToRegister(d, !single); - if (!single) - { - fpr.BindToRegister(b, true, false); - } - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), fpr.R(b)); + avx_op(avxOp, sseOp, 
fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible); } if (single) { @@ -104,10 +66,10 @@ void Jit64::fp_arith(UGeckoInstruction inst) bool single = inst.OPCD == 59; switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul + case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div + case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub + case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -131,18 +93,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately if (inst.SUBOP5 == 30) //nmsub { - MOVSD(XMM1, fpr.R(c)); if (single_precision) - Force25BitPrecision(XMM1, XMM0); + Force25BitPrecision(XMM1, fpr.R(c), XMM0); + else + MOVSD(XMM1, fpr.R(c)); MULSD(XMM1, fpr.R(a)); MOVSD(XMM0, fpr.R(b)); SUBSD(XMM0, R(XMM1)); } else { - MOVSD(XMM0, fpr.R(c)); if (single_precision) - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + else + MOVSD(XMM0, fpr.R(c)); MULSD(XMM0, fpr.R(a)); if (inst.SUBOP5 == 28) //msub SUBSD(XMM0, fpr.R(b)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 7df0302e64..cd069bb9fc 100644 --- 
a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -43,17 +43,15 @@ void Jit64::ps_sel(UGeckoInstruction inst) if (cpu_info.bSSE4_1) { - MOVAPD(XMM1, fpr.R(a)); PXOR(XMM0, R(XMM0)); - CMPPD(XMM0, R(XMM1), NLE); + CMPPD(XMM0, fpr.R(a), NLE); MOVAPD(XMM1, fpr.R(c)); BLENDVPD(XMM1, fpr.R(b)); } else { - MOVAPD(XMM0, fpr.R(a)); PXOR(XMM1, R(XMM1)); - CMPPD(XMM1, R(XMM0), NLE); + CMPPD(XMM1, fpr.R(a), NLE); MOVAPD(XMM0, R(XMM1)); PAND(XMM1, fpr.R(b)); PANDN(XMM0, fpr.R(c)); @@ -74,26 +72,18 @@ void Jit64::ps_sign(UGeckoInstruction inst) int b = inst.FB; fpr.Lock(d, b); - if (d != b) - { - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(b)); - } - else - { - fpr.BindToRegister(d, true); - } + fpr.BindToRegister(d, d == b); switch (inst.SUBOP10) { case 40: //neg - PXOR(fpr.RX(d), M((void*)&psSignBits)); + avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits)); break; case 136: //nabs - POR(fpr.RX(d), M((void*)&psSignBits)); + avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits)); break; case 264: //abs - PAND(fpr.RX(d), M((void*)&psAbsMask)); + avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask)); break; } @@ -101,56 +91,29 @@ void Jit64::ps_sign(UGeckoInstruction inst) } //There's still a little bit more optimization that can be squeezed out of this -void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) +void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); + fpr.BindToRegister(d, d == a || d == b); if (roundRHS) { if (d == a) { - fpr.BindToRegister(d, true); - MOVAPD(XMM0, fpr.R(b)); - Force25BitPrecision(XMM0, XMM1); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(XMM0, fpr.R(b), 
XMM1); + (this->*sseOp)(fpr.RX(d), R(XMM0)); } else { - fpr.BindToRegister(d, d == b); - if (d != b) - MOVAPD(fpr.RX(d), fpr.R(b)); - Force25BitPrecision(fpr.RX(d), XMM0); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - } - else if (d == a) - { - fpr.BindToRegister(d, true); - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (d == b) - { - if (reversible) - { - fpr.BindToRegister(d, true); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - else - { - MOVAPD(XMM0, fpr.R(b)); - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), R(XMM0)); + Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); + (this->*sseOp)(fpr.RX(d), fpr.R(a)); } } else { - //sources different from d, can use rather quick solution - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), fpr.R(b)); + avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible); } - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -164,16 +127,16 @@ void Jit64::ps_arith(UGeckoInstruction inst) switch (inst.SUBOP5) { case 18: // div - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst); break; case 20: // sub - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst); break; case 21: // add - tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst); + tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst); break; case 25: // mul - tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true); + tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true); break; default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); @@ -208,10 +171,9 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlert("ps_sum WTF!!!"); } 
- ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -232,18 +194,16 @@ void Jit64::ps_muls(UGeckoInstruction inst) MOVDDUP(XMM0, fpr.R(c)); break; case 13: - MOVAPD(XMM0, fpr.R(c)); - SHUFPD(XMM0, R(XMM0), 3); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); break; default: PanicAlert("ps_muls WTF!!!"); } - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, R(XMM0), XMM1); MULPD(XMM0, fpr.R(a)); - ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -258,27 +218,25 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst) int a = inst.FA; int b = inst.FB; fpr.Lock(a,b,d); + fpr.BindToRegister(d, d == a || d == b); - MOVAPD(XMM0, fpr.R(a)); switch (inst.SUBOP10) { case 528: - UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf + avx_op(&XEmitter::VUNPCKLPD, &XEmitter::UNPCKLPD, fpr.RX(d), fpr.R(a), fpr.R(b)); break; //00 case 560: - SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 2); break; //01 case 592: - SHUFPD(XMM0, fpr.R(b), 1); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 1); break; //10 case 624: - UNPCKHPD(XMM0, fpr.R(b)); + avx_op(&XEmitter::VUNPCKHPD, &XEmitter::UNPCKHPD, fpr.RX(d), fpr.R(a), fpr.R(b)); break; //11 default: _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); } - fpr.BindToRegister(d, false); - MOVAPD(fpr.RX(d), R(XMM0)); fpr.UnlockAll(); } @@ -303,7 +261,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst) CALL((void *)asm_routines.frsqrte); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); 
SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -330,7 +288,7 @@ void Jit64::ps_res(UGeckoInstruction inst) CALL((void *)asm_routines.fres); MOVLHPS(fpr.RX(d), XMM0); - ForceSinglePrecisionP(fpr.RX(d)); + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); gpr.UnlockAllX(); @@ -352,42 +310,35 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) switch (inst.SUBOP5) { case 14: //madds0 - MOVDDUP(XMM1, fpr.R(c)); - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - MULPD(XMM0, R(XMM1)); + MOVDDUP(XMM0, fpr.R(c)); + Force25BitPrecision(XMM0, R(XMM0), XMM1); + MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 15: //madds1 - MOVAPD(XMM1, fpr.R(c)); - SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower - Force25BitPrecision(XMM1, XMM0); - MOVAPD(XMM0, fpr.R(a)); - MULPD(XMM0, R(XMM1)); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); + Force25BitPrecision(XMM0, R(XMM0), XMM1); + MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 28: //msub - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); SUBPD(XMM0, fpr.R(b)); break; case 29: //madd - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); break; case 30: //nmsub - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); SUBPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); break; case 31: //nmadd - MOVAPD(XMM0, fpr.R(c)); - Force25BitPrecision(XMM0, XMM1); + Force25BitPrecision(XMM0, fpr.R(c), XMM1); MULPD(XMM0, fpr.R(a)); ADDPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); @@ -399,9 +350,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) return; } fpr.BindToRegister(d, false); - ForceSinglePrecisionP(XMM0); - SetFPRFIfNeeded(inst, XMM0); - 
MOVAPD(fpr.RX(d), R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d), XMM0); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index cfb92bd466..a4036ab172 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -409,8 +409,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() const u8* storeSingleU8 = AlignCode4(); // Used by MKWii SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); XORPS(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_255)); @@ -420,8 +419,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() const u8* storeSingleS8 = AlignCode4(); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MAXSS(XMM0, M((void *)&m_m128)); MINSS(XMM0, M((void *)&m_127)); CVTTSS2SI(RSCRATCH, R(XMM0)); @@ -430,8 +428,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() const u8* storeSingleU16 = AlignCode4(); // Used by MKWii SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); XORPS(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_65535)); @@ -441,8 +438,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() const u8* storeSingleS16 = AlignCode4(); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MAXSS(XMM0, M((void *)&m_m32768)); MINSS(XMM0, M((void *)&m_32767)); CVTTSS2SI(RSCRATCH, R(XMM0)); @@ -543,8 +539,7 @@ void CommonAsmRoutines::GenQuantizedLoads() 
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); @@ -583,8 +578,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); @@ -618,8 +612,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); @@ -652,8 +645,7 @@ void CommonAsmRoutines::GenQuantizedLoads() UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); SHR(32, R(RSCRATCH2), Imm8(5)); - MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); - MULSS(XMM0, R(XMM1)); + MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 283ea6ea2e..45f37b6889 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -608,13 +608,98 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) } } -void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) +void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input) { // Most games don't 
need these. Zelda requires it though - some platforms get stuck without them. if (jit->jo.accurateSinglePrecision) { - CVTPD2PS(xmm, R(xmm)); - CVTPS2PD(xmm, R(xmm)); + CVTPD2PS(input, R(input)); + CVTPS2PD(output, R(input)); + } + else if (output != input) + { + MOVAPD(output, R(input)); + } +} + +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), + X64Reg regOp, OpArg arg1, OpArg arg2, bool packed, bool reversible) +{ + if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg()) + { + (this->*sseOp)(regOp, arg2); + } + else if (arg1.IsSimpleReg() && cpu_info.bAVX) + { + (this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2); + } + else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp) + { + if (reversible) + { + (this->*sseOp)(regOp, arg1); + } + else + { + // The ugly case: regOp == arg2 without AVX, or with arg1 == memory + if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0) + MOVAPD(XMM0, arg1); + if (cpu_info.bAVX) + { + (this->*avxOp)(regOp, XMM0, arg2); + } + else + { + (this->*sseOp)(XMM0, arg2); + if (packed) + MOVAPD(regOp, R(XMM0)); + else + MOVSD(regOp, R(XMM0)); + } + } + } + else + { + if (packed) + MOVAPD(regOp, arg1); + else + MOVSD(regOp, arg1); + (this->*sseOp)(regOp, arg1 == arg2 ? 
R(regOp) : arg2); + } +} + +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg, u8), void (XEmitter::*sseOp)(X64Reg, OpArg, u8), + X64Reg regOp, OpArg arg1, OpArg arg2, u8 imm) +{ + if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg()) + { + (this->*sseOp)(regOp, arg2, imm); + } + else if (arg1.IsSimpleReg() && cpu_info.bAVX) + { + (this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2, imm); + } + else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp) + { + // The ugly case: regOp == arg2 without AVX, or with arg1 == memory + if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0) + MOVAPD(XMM0, arg1); + if (cpu_info.bAVX) + { + (this->*avxOp)(regOp, XMM0, arg2, imm); + } + else + { + (this->*sseOp)(XMM0, arg2, imm); + MOVAPD(regOp, R(XMM0)); + } + } + else + { + MOVAPD(regOp, arg1); + (this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2, imm); } } @@ -625,15 +710,25 @@ static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000}; // a single precision multiply. To be precise, it drops the low 28 bits of the mantissa, // rounding to nearest as it does. // It needs a temp, so let the caller pass that in. 
-void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp) +void EmuCodeBlock::Force25BitPrecision(X64Reg output, OpArg input, X64Reg tmp) { if (jit->jo.accurateSinglePrecision) { // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1); - MOVAPD(tmp, R(xmm)); - PAND(xmm, M((void*)&psMantissaTruncate)); - PAND(tmp, M((void*)&psRoundBit)); - PADDQ(xmm, R(tmp)); + if (input.IsSimpleReg() && cpu_info.bAVX) + { + VPAND(tmp, input.GetSimpleReg(), M((void*)&psRoundBit)); + VPAND(output, input.GetSimpleReg(), M((void*)&psMantissaTruncate)); + PADDQ(output, R(tmp)); + } + else + { + if (!input.IsSimpleReg() || input.GetSimpleReg() != output) + MOVAPD(output, input); + avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M((void*)&psRoundBit), true, true); + PAND(output, M((void*)&psMantissaTruncate)); + PADDQ(output, R(tmp)); + } } } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index e46621067a..43b54debd9 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -123,9 +123,14 @@ public: void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCA(); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), + Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, bool packed = true, bool reversible = false); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8), + Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm); + void ForceSinglePrecisionS(Gen::X64Reg xmm); - void ForceSinglePrecisionP(Gen::X64Reg xmm); - void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp); + void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input); + void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp); // RSCRATCH might get trashed void 
ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 923e621dcd..e0c76192b6 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -802,6 +802,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 bool wantsCA = true; u32 fregInUse = 0; u32 regInUse = 0; + u32 fregInXmm = 0; for (int i = block->m_num_instructions - 1; i >= 0; i--) { bool opWantsCR0 = code[i].wantsCR0; @@ -822,6 +823,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 wantsCA &= !code[i].outputCA || opWantsCA; code[i].gprInUse = regInUse; code[i].fprInUse = fregInUse; + code[i].fprInXmm = fregInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. for (int j = 0; j < 3; j++) @@ -829,7 +831,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 regInUse |= 1 << code[i].regsIn[j]; for (int j = 0; j < 4; j++) if (code[i].fregsIn[j] >= 0) + { fregInUse |= 1 << code[i].fregsIn[j]; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fregInXmm |= 1 << code[i].fregsIn[j]; + } // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). 
@@ -837,7 +843,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 if (code[i].regsOut[j] >= 0) regInUse |= 1 << code[i].regsOut[j]; if (code[i].fregOut >= 0) + { fregInUse |= 1 << code[i].fregOut; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fregInXmm |= 1 << code[i].fregOut; + } } return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 4936f854c5..a591c7f489 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -45,6 +45,9 @@ struct CodeOp //16B // which registers are still needed after this instruction in this block u32 gprInUse; u32 fprInUse; + // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into + // an XMM only to move it again to a GPR afterwards. + u32 fprInXmm; }; struct BlockStats