Merge pull request #2496 from Tilka/fma4
Jit64: add FMA4 support to fmaddXX
This commit is contained in:
commit
d3e47dfcf5
|
@ -44,6 +44,7 @@ struct CPUInfo
|
|||
bool bBMI1;
|
||||
bool bBMI2;
|
||||
bool bFMA;
|
||||
bool bFMA4;
|
||||
bool bAES;
|
||||
// FXSAVE/FXRSTOR
|
||||
bool bFXSR;
|
||||
|
|
|
@ -175,6 +175,7 @@ void CPUInfo::Detect()
|
|||
__cpuid(cpu_id, 0x80000001);
|
||||
if (cpu_id[2] & 1) bLAHFSAHF64 = true;
|
||||
if ((cpu_id[2] >> 5) & 1) bLZCNT = true;
|
||||
if ((cpu_id[2] >> 16) & 1) bFMA4 = true;
|
||||
if ((cpu_id[3] >> 29) & 1) bLongMode = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -1437,6 +1437,13 @@ void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg
|
|||
WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W);
|
||||
}
|
||||
|
||||
// Emits one FMA4 instruction.
//   op      - low opcode byte; it is OR'ed into 0x3A00 to form the opcode passed on.
//   dest    - forwarded to WriteVEXOp4 as its first register operand.
//   regOp1  - forwarded as the second register operand.
//   regOp2  - forwarded as the trailing register operand (after arg).
//   arg     - the OpArg operand.
//   W       - forwarded unchanged to WriteVEXOp4.
// If cpu_info.bFMA4 is not set, a PanicAlert is raised first.
void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W)
|
||||
{
|
||||
if (!cpu_info.bFMA4)
|
||||
PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd.");
|
||||
// The emit below is not guarded by the check above; it runs in either case.
// Note the argument order: arg is passed before regOp2.
WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W);
|
||||
}
|
||||
|
||||
void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
|
||||
{
|
||||
CheckFlags();
|
||||
|
@ -1921,6 +1928,32 @@ void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {W
|
|||
// VFMSUBADD...PD emitters (213 and 231 forms): each forwards to WriteFMA3Op
// with its own opcode byte (0xA7 / 0xB7) and W = 1.
void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1);}
|
||||
|
||||
// FMA4 emitters. For every (name, opcode byte) pair listed below, the FMA4
// macro defines two XEmitter overloads that both forward to WriteFMA4Op:
//   name(dest, regOp1, regOp2, arg)  -> passes W = 1
//   name(dest, regOp1, arg, regOp2)  -> passes W = 0
// NOTE(review): W presumably selects which source slot the OpArg occupies in
// the encoding; confirm against WriteVEXOp4 and the AMD FMA4 documentation.
// The macro is local to this list and is #undef'ed right after it.
#define FMA4(name, op) \
|
||||
void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1);} \
|
||||
void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0);}
|
||||
|
||||
FMA4(VFMADDSUBPS, 0x5C)
|
||||
FMA4(VFMADDSUBPD, 0x5D)
|
||||
FMA4(VFMSUBADDPS, 0x5E)
|
||||
FMA4(VFMSUBADDPD, 0x5F)
|
||||
FMA4(VFMADDPS, 0x68)
|
||||
FMA4(VFMADDPD, 0x69)
|
||||
FMA4(VFMADDSS, 0x6A)
|
||||
FMA4(VFMADDSD, 0x6B)
|
||||
FMA4(VFMSUBPS, 0x6C)
|
||||
FMA4(VFMSUBPD, 0x6D)
|
||||
FMA4(VFMSUBSS, 0x6E)
|
||||
FMA4(VFMSUBSD, 0x6F)
|
||||
FMA4(VFNMADDPS, 0x78)
|
||||
FMA4(VFNMADDPD, 0x79)
|
||||
FMA4(VFNMADDSS, 0x7A)
|
||||
FMA4(VFNMADDSD, 0x7B)
|
||||
FMA4(VFNMSUBPS, 0x7C)
|
||||
FMA4(VFNMSUBPD, 0x7D)
|
||||
FMA4(VFNMSUBSS, 0x7E)
|
||||
FMA4(VFNMSUBSD, 0x7F)
|
||||
#undef FMA4
|
||||
|
||||
// SARX / SHLX / SHRX emitters. All three forward to WriteBMI2Op with opcode
// 0x38F7 and differ only in the prefix byte: 0xF3 (SARX), 0x66 (SHLX),
// 0xF2 (SHRX). The public parameter order is (regOp1, arg, regOp2), while
// WriteBMI2Op is called with (regOp1, regOp2, arg).
void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
|
||||
void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
|
||||
void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
|
||||
|
|
|
@ -291,6 +291,7 @@ private:
|
|||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, int extrabytes = 0);
|
||||
void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3, int W = 0);
|
||||
void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
|
||||
void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
|
||||
void WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
|
||||
void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
|
||||
void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
|
||||
|
@ -853,6 +854,32 @@ public:
|
|||
void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
|
||||
// FMA4 instructions. Each FMA4(name) line below declares two overloads of
// `name` that differ only in where the OpArg operand sits:
//   name(dest, regOp1, regOp2, arg)   - OpArg is the last operand
//   name(dest, regOp1, arg, regOp2)   - OpArg is the third operand
// The macro is local to this list and is #undef'ed right after it.
#define FMA4(name) \
|
||||
void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \
|
||||
void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
|
||||
|
||||
FMA4(VFMADDSUBPS)
|
||||
FMA4(VFMADDSUBPD)
|
||||
FMA4(VFMSUBADDPS)
|
||||
FMA4(VFMSUBADDPD)
|
||||
FMA4(VFMADDPS)
|
||||
FMA4(VFMADDPD)
|
||||
FMA4(VFMADDSS)
|
||||
FMA4(VFMADDSD)
|
||||
FMA4(VFMSUBPS)
|
||||
FMA4(VFMSUBPD)
|
||||
FMA4(VFMSUBSS)
|
||||
FMA4(VFMSUBSD)
|
||||
FMA4(VFNMADDPS)
|
||||
FMA4(VFNMADDPD)
|
||||
FMA4(VFNMADDSS)
|
||||
FMA4(VFNMADDSD)
|
||||
FMA4(VFNMSUBPS)
|
||||
FMA4(VFNMSUBPD)
|
||||
FMA4(VFNMSUBSS)
|
||||
FMA4(VFNMSUBSD)
|
||||
#undef FMA4
|
||||
|
||||
// VEX GPR instructions
|
||||
void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
|
||||
void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
|
||||
|
|
|
@ -184,6 +184,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
break;
|
||||
}
|
||||
}
|
||||
else if (cpu_info.bFMA4 && !Core::g_want_determinism)
|
||||
{
|
||||
fpr.BindToRegister(b, true, false);
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 28: //msub
|
||||
if (packed)
|
||||
VFMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
else
|
||||
VFMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
break;
|
||||
case 14: //madds0
|
||||
case 15: //madds1
|
||||
case 29: //madd
|
||||
if (packed)
|
||||
VFMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
else
|
||||
VFMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
if (packed)
|
||||
VFNMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
else
|
||||
VFNMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
if (packed)
|
||||
VFNMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
else
|
||||
VFNMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (inst.SUBOP5 == 30) //nmsub
|
||||
{
|
||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.
|
||||
|
|
|
@ -1046,4 +1046,43 @@ FMA3_TEST(VFNMSUB, S, false)
|
|||
FMA3_TEST(VFMADDSUB, P, true)
|
||||
FMA3_TEST(VFMSUBADD, P, true)
|
||||
|
||||
// Test generator for VEX instructions that accept either operand form:
//   op reg, reg, r/m, reg   OR   op reg, reg, reg, r/m
// Name is the emitter method under test; sizename is the operand-size word
// expected in front of "ptr ds:[r12]" in the disassembly.
// For every register in xmmnames the test emits three instructions
// (all-register, memory operand last, memory operand third) and checks the
// combined disassembly text against the expected string built below.
#define VEX_RRMR_RRRM_TEST(Name, sizename) \
|
||||
TEST_F(x64EmitterTest, Name) \
|
||||
{ \
|
||||
struct { \
|
||||
int bits; \
|
||||
std::vector<NamedReg> regs; \
|
||||
std::string out_name; \
|
||||
std::string size; \
|
||||
} regsets[] = { \
|
||||
{ 64, xmmnames, "xmm0", sizename }, \
|
||||
}; \
|
||||
for (const auto& regset : regsets) \
|
||||
for (const auto& r : regset.regs) \
|
||||
{ \
|
||||
emitter->Name(r.reg, XMM0, R(XMM0), r.reg); \
|
||||
emitter->Name(XMM0, XMM0, r.reg, MatR(R12)); \
|
||||
emitter->Name(XMM0, r.reg, MatR(R12), XMM0); \
|
||||
ExpectDisassembly(#Name " " + r.name+ ", " + regset.out_name + ", " + regset.out_name + ", " + r.name + " " \
|
||||
#Name " " + regset.out_name + ", " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12] " \
|
||||
#Name " " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12], " + regset.out_name); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Instantiates VEX_RRMR_RRRM_TEST twice for one FMA4 instruction family:
// once for Name##P##S and once for Name##P##D. `packed` picks the
// memory-operand size word handed to the test: "dqword" when true,
// otherwise "dword" for the ...S form and "qword" for the ...D form.
#define FMA4_TEST(Name, P, packed) \
|
||||
VEX_RRMR_RRRM_TEST(Name ## P ## S, packed ? "dqword" : "dword") \
|
||||
VEX_RRMR_RRRM_TEST(Name ## P ## D, packed ? "dqword" : "qword")
|
||||
|
||||
FMA4_TEST(VFMADD, P, true)
|
||||
FMA4_TEST(VFMADD, S, false)
|
||||
FMA4_TEST(VFMSUB, P, true)
|
||||
FMA4_TEST(VFMSUB, S, false)
|
||||
FMA4_TEST(VFNMADD, P, true)
|
||||
FMA4_TEST(VFNMADD, S, false)
|
||||
FMA4_TEST(VFNMSUB, P, true)
|
||||
FMA4_TEST(VFNMSUB, S, false)
|
||||
FMA4_TEST(VFMADDSUB, P, true)
|
||||
FMA4_TEST(VFMSUBADD, P, true)
|
||||
|
||||
} // namespace Gen
|
||||
|
|
Loading…
Reference in New Issue