diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h
index 180feabceb..28263ceeb2 100644
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@@ -44,6 +44,7 @@ struct CPUInfo
 	bool bBMI1;
 	bool bBMI2;
 	bool bFMA;
+	bool bFMA4;
 	bool bAES;
 	// FXSAVE/FXRSTOR
 	bool bFXSR;
diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp
index 461c1bde8a..0e9f32e622 100644
--- a/Source/Core/Common/x64CPUDetect.cpp
+++ b/Source/Core/Common/x64CPUDetect.cpp
@@ -175,6 +175,7 @@ void CPUInfo::Detect()
 		__cpuid(cpu_id, 0x80000001);
 		if (cpu_id[2] & 1) bLAHFSAHF64 = true;
 		if ((cpu_id[2] >> 5) & 1) bLZCNT = true;
+		if ((cpu_id[2] >> 16) & 1) bFMA4 = true;
 		if ((cpu_id[3] >> 29) & 1) bLongMode = true;
 	}
 
diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 2ba467a25c..c1b703b174 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1437,6 +1437,16 @@ void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg
 	WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W);
 }
 
+// Emits a four-operand FMA4 instruction through WriteVEXOp4, using the 0x66
+// prefix and opcode 0x3A00 | op; regOp2 is passed as the trailing register
+// operand. Raises a PanicAlert first if cpu_info.bFMA4 is not set.
+void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W)
+{
+	if (!cpu_info.bFMA4)
+		PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd.");
+	WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W);
+}
+
 void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
 {
 	CheckFlags();
@@ -1921,6 +1931,34 @@ void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {W
 void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1);}
 void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1);}
 
+// Each FMA4 instruction gets two overloads that differ in where the OpArg
+// operand sits: last (emitted with W = 1) or third (emitted with W = 0).
+#define FMA4(name, op) \
+void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1);} \
+void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0);}
+
+FMA4(VFMADDSUBPS, 0x5C)
+FMA4(VFMADDSUBPD, 0x5D)
+FMA4(VFMSUBADDPS, 0x5E)
+FMA4(VFMSUBADDPD, 0x5F)
+FMA4(VFMADDPS, 0x68)
+FMA4(VFMADDPD, 0x69)
+FMA4(VFMADDSS, 0x6A)
+FMA4(VFMADDSD, 0x6B)
+FMA4(VFMSUBPS, 0x6C)
+FMA4(VFMSUBPD, 0x6D)
+FMA4(VFMSUBSS, 0x6E)
+FMA4(VFMSUBSD, 0x6F)
+FMA4(VFNMADDPS, 0x78)
+FMA4(VFNMADDPD, 0x79)
+FMA4(VFNMADDSS, 0x7A)
+FMA4(VFNMADDSD, 0x7B)
+FMA4(VFNMSUBPS, 0x7C)
+FMA4(VFNMSUBPD, 0x7D)
+FMA4(VFNMSUBSS, 0x7E)
+FMA4(VFNMSUBSD, 0x7F)
+#undef FMA4
+
 void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
 void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
 void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 24331c2db1..7b6c9be703 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -291,6 +291,7 @@ private:
 	void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, int extrabytes = 0);
 	void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3, int W = 0);
 	void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
+	void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
 	void WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
 	void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
 	void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
@@ -853,6 +854,33 @@ public:
 	void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
 	void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
 
+	// FMA4 instructions; the OpArg operand may be either the fourth or the third operand.
+	#define FMA4(name) \
+	void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \
+	void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+
+	FMA4(VFMADDSUBPS)
+	FMA4(VFMADDSUBPD)
+	FMA4(VFMSUBADDPS)
+	FMA4(VFMSUBADDPD)
+	FMA4(VFMADDPS)
+	FMA4(VFMADDPD)
+	FMA4(VFMADDSS)
+	FMA4(VFMADDSD)
+	FMA4(VFMSUBPS)
+	FMA4(VFMSUBPD)
+	FMA4(VFMSUBSS)
+	FMA4(VFMSUBSD)
+	FMA4(VFNMADDPS)
+	FMA4(VFNMADDPD)
+	FMA4(VFNMADDSS)
+	FMA4(VFNMADDSD)
+	FMA4(VFNMSUBPS)
+	FMA4(VFNMSUBPD)
+	FMA4(VFNMSUBSS)
+	FMA4(VFNMSUBSD)
+	#undef FMA4
+
 	// VEX GPR instructions
 	void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
 	void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index ebaac9f320..032660aa1c 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -184,6 +184,40 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 			break;
 		}
 	}
+	else if (cpu_info.bFMA4 && !Core::g_want_determinism)
+	{
+		// b is passed as a plain register operand (fpr.RX(b)) below, so bind it to a register first.
+		fpr.BindToRegister(b, true, false);
+		switch (inst.SUBOP5)
+		{
+		case 28: //msub
+			if (packed)
+				VFMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			else
+				VFMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			break;
+		case 14: //madds0
+		case 15: //madds1
+		case 29: //madd
+			if (packed)
+				VFMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			else
+				VFMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			break;
+		case 30: //nmsub
+			if (packed)
+				VFNMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			else
+				VFNMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			break;
+		case 31: //nmadd
+			if (packed)
+				VFNMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			else
+				VFNMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
+			break;
+		}
+	}
 	else if (inst.SUBOP5 == 30) //nmsub
 	{
 		// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
diff --git a/Source/UnitTests/Common/x64EmitterTest.cpp b/Source/UnitTests/Common/x64EmitterTest.cpp
index adbdf76a77..6659542dee 100644
--- a/Source/UnitTests/Common/x64EmitterTest.cpp
+++ b/Source/UnitTests/Common/x64EmitterTest.cpp
@@ -1046,4 +1046,43 @@ FMA3_TEST(VFNMSUB, S, false)
 FMA3_TEST(VFMADDSUB, P, true)
 FMA3_TEST(VFMSUBADD, P, true)
 
+// for VEX instructions that take the form op reg, reg, r/m, reg OR reg, reg, reg, r/m
+#define VEX_RRMR_RRRM_TEST(Name, sizename) \
+	TEST_F(x64EmitterTest, Name) \
+	{ \
+		struct { \
+			int bits; \
+			std::vector<NamedReg> regs; \
+			std::string out_name; \
+			std::string size; \
+		} regsets[] = { \
+			{ 64, xmmnames, "xmm0", sizename }, \
+		}; \
+		for (const auto& regset : regsets) \
+			for (const auto& r : regset.regs) \
+			{ \
+				emitter->Name(r.reg, XMM0, R(XMM0), r.reg); \
+				emitter->Name(XMM0, XMM0, r.reg, MatR(R12)); \
+				emitter->Name(XMM0, r.reg, MatR(R12), XMM0); \
+				ExpectDisassembly(#Name " " + r.name + ", " + regset.out_name + ", " + regset.out_name + ", " + r.name + " " \
+				                  #Name " " + regset.out_name + ", " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12] " \
+				                  #Name " " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12], " + regset.out_name); \
+			} \
+	}
+
+#define FMA4_TEST(Name, P, packed) \
+	VEX_RRMR_RRRM_TEST(Name ## P ## S, packed ? "dqword" : "dword") \
+	VEX_RRMR_RRRM_TEST(Name ## P ## D, packed ? "dqword" : "qword")
+
+FMA4_TEST(VFMADD, P, true)
+FMA4_TEST(VFMADD, S, false)
+FMA4_TEST(VFMSUB, P, true)
+FMA4_TEST(VFMSUB, S, false)
+FMA4_TEST(VFNMADD, P, true)
+FMA4_TEST(VFNMADD, S, false)
+FMA4_TEST(VFNMSUB, P, true)
+FMA4_TEST(VFNMSUB, S, false)
+FMA4_TEST(VFMADDSUB, P, true)
+FMA4_TEST(VFMSUBADD, P, true)
+
 } // namespace Gen