Merge pull request #2496 from Tilka/fma4

Jit64: add FMA4 support to fmaddXX
This commit is contained in:
flacs 2015-06-06 17:31:55 +02:00
commit d3e47dfcf5
6 changed files with 134 additions and 0 deletions

View File

@ -44,6 +44,7 @@ struct CPUInfo
bool bBMI1;
bool bBMI2;
bool bFMA;
bool bFMA4;
bool bAES;
// FXSAVE/FXRSTOR
bool bFXSR;

View File

@ -175,6 +175,7 @@ void CPUInfo::Detect()
__cpuid(cpu_id, 0x80000001);
if (cpu_id[2] & 1) bLAHFSAHF64 = true;
if ((cpu_id[2] >> 5) & 1) bLZCNT = true;
if ((cpu_id[2] >> 16) & 1) bFMA4 = true;
if ((cpu_id[3] >> 29) & 1) bLongMode = true;
}

View File

@ -1437,6 +1437,13 @@ void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg
WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W);
}
// Emits one four-operand FMA4 instruction through WriteVEXOp4.
//   op      opcode byte; it is OR'ed into 0x3A00 to form the opcode handed to WriteVEXOp4
//   dest    destination register
//   regOp1  register source passed straight after dest
//   regOp2  register source; note that it is passed to WriteVEXOp4 *after* arg
//   arg     OpArg source operand
//   W       forwarded unchanged to WriteVEXOp4
// When cpu_info.bFMA4 is not set a PanicAlert is raised, but the instruction bytes are
// still written afterwards.
void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W)
{
	const bool fma4_available = cpu_info.bFMA4;
	if (!fma4_available)
	{
		PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd.");
	}

	const int full_opcode = 0x3A00 | op;
	WriteVEXOp4(0x66, full_opcode, dest, regOp1, arg, regOp2, W);
}
void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes)
{
CheckFlags();
@ -1921,6 +1928,32 @@ void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {W
// VFMSUBADD...PD emitters for the 213 and 231 operand orderings. Both simply forward to
// WriteFMA3Op with their opcode byte (0xA7 and 0xB7 respectively) and W = 1.
void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1);}
void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) {WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1);}
// Generates the two XEmitter overloads for one FMA4 mnemonic. Both end up in WriteFMA4Op
// with the same opcode byte; they differ in where the OpArg operand sits in the argument
// list and in the last value handed to WriteFMA4Op:
//   name(dest, regOp1, regOp2, arg)  -> W = 1
//   name(dest, regOp1, arg, regOp2)  -> W = 0
// (W is presumably the VEX.W bit that tells the CPU which source slot is the r/m
// operand -- confirm against the FMA4 encoding reference.)
// The macro is local to this table and is undefined again right after it.
#define DEFINE_FMA4_EMITTERS(name, op) \
	void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \
	{ \
		WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \
	} \
	void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \
	{ \
		WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \
	}

// Opcode bytes 0x5C-0x5F
DEFINE_FMA4_EMITTERS(VFMADDSUBPS, 0x5C)
DEFINE_FMA4_EMITTERS(VFMADDSUBPD, 0x5D)
DEFINE_FMA4_EMITTERS(VFMSUBADDPS, 0x5E)
DEFINE_FMA4_EMITTERS(VFMSUBADDPD, 0x5F)
// Opcode bytes 0x68-0x6F
DEFINE_FMA4_EMITTERS(VFMADDPS, 0x68)
DEFINE_FMA4_EMITTERS(VFMADDPD, 0x69)
DEFINE_FMA4_EMITTERS(VFMADDSS, 0x6A)
DEFINE_FMA4_EMITTERS(VFMADDSD, 0x6B)
DEFINE_FMA4_EMITTERS(VFMSUBPS, 0x6C)
DEFINE_FMA4_EMITTERS(VFMSUBPD, 0x6D)
DEFINE_FMA4_EMITTERS(VFMSUBSS, 0x6E)
DEFINE_FMA4_EMITTERS(VFMSUBSD, 0x6F)
// Opcode bytes 0x78-0x7F
DEFINE_FMA4_EMITTERS(VFNMADDPS, 0x78)
DEFINE_FMA4_EMITTERS(VFNMADDPD, 0x79)
DEFINE_FMA4_EMITTERS(VFNMADDSS, 0x7A)
DEFINE_FMA4_EMITTERS(VFNMADDSD, 0x7B)
DEFINE_FMA4_EMITTERS(VFNMSUBPS, 0x7C)
DEFINE_FMA4_EMITTERS(VFNMSUBPD, 0x7D)
DEFINE_FMA4_EMITTERS(VFNMSUBSS, 0x7E)
DEFINE_FMA4_EMITTERS(VFNMSUBSD, 0x7F)

#undef DEFINE_FMA4_EMITTERS
// SARX / SHLX / SHRX: all three go through WriteBMI2Op with the same opcode (0x38F7) and are
// told apart only by the prefix byte (0xF3, 0x66 and 0xF2 respectively).
// Note the operand order: callers pass (regOp1, arg, regOp2), while WriteBMI2Op receives
// (regOp1, regOp2, arg).
void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}

View File

@ -291,6 +291,7 @@ private:
// Private encoding helpers used by the public per-instruction emitters.
// W and extrabytes default to 0 wherever they appear.
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, int extrabytes = 0);
// Four-operand variant: takes an additional register operand (regOp3) after arg.
void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3, int W = 0);
// FMA3: only the opcode byte is supplied by the caller.
void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
// FMA4: like WriteFMA3Op but with a separate destination register.
void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
// BMI helpers: size is the operand width argument supplied by the caller
// (the public emitters name it `bits`).
void WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
@ -853,6 +854,32 @@ public:
// VFMSUBADD...PD in the 213 and 231 operand orderings (three-operand FMA3 form:
// two registers plus one OpArg).
void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
#define FMA4(name) \
void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \
void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
FMA4(VFMADDSUBPS)
FMA4(VFMADDSUBPD)
FMA4(VFMSUBADDPS)
FMA4(VFMSUBADDPD)
FMA4(VFMADDPS)
FMA4(VFMADDPD)
FMA4(VFMADDSS)
FMA4(VFMADDSD)
FMA4(VFMSUBPS)
FMA4(VFMSUBPD)
FMA4(VFMSUBSS)
FMA4(VFMSUBSD)
FMA4(VFNMADDPS)
FMA4(VFNMADDPD)
FMA4(VFNMADDSS)
FMA4(VFNMADDSD)
FMA4(VFNMSUBPS)
FMA4(VFNMSUBPD)
FMA4(VFNMSUBSS)
FMA4(VFNMSUBSD)
#undef FMA4
// VEX GPR instructions
// `bits` is the operand width argument; in these signatures the OpArg operand (arg) comes
// before the second register operand (regOp2).
void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);

View File

@ -184,6 +184,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
break;
}
}
else if (cpu_info.bFMA4 && !Core::g_want_determinism)
{
fpr.BindToRegister(b, true, false);
switch (inst.SUBOP5)
{
case 28: //msub
if (packed)
VFMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
else
VFMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
break;
case 14: //madds0
case 15: //madds1
case 29: //madd
if (packed)
VFMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
else
VFMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
break;
case 30: //nmsub
if (packed)
VFNMADDPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
else
VFNMADDSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
break;
case 31: //nmadd
if (packed)
VFNMSUBPD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
else
VFNMSUBSD(XMM1, XMM1, fpr.R(a), fpr.RX(b));
break;
}
}
else if (inst.SUBOP5 == 30) //nmsub
{
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately.

View File

@ -1046,4 +1046,43 @@ FMA3_TEST(VFNMSUB, S, false)
FMA3_TEST(VFMADDSUB, P, true)
FMA3_TEST(VFMSUBADD, P, true)
// for VEX instructions that take the form op reg, reg, r/m, reg OR reg, reg, reg, r/m
//
// Defines an x64EmitterTest case called Name. For every register r in xmmnames the test
// emits three encodings of the instruction and compares them with the expected disassembly:
//   1. Name r,    xmm0, xmm0,                  r       (OpArg in third position, register form)
//   2. Name xmm0, xmm0, r,    <size> ptr ds:[r12]     (OpArg last, memory form)
//   3. Name xmm0, r,    <size> ptr ds:[r12],  xmm0    (OpArg in third position, memory form)
// sizename is the size keyword expected in front of "ptr" for the memory operand.
// The `bits` field of the regset is initialised to 64 but is not read inside this macro.
// NOTE: no comments may be placed inside the macro body; every line ends in a continuation
// backslash, and the expected string relies on adjacent string literals being pasted together
// across those lines.
#define VEX_RRMR_RRRM_TEST(Name, sizename) \
TEST_F(x64EmitterTest, Name) \
{ \
struct { \
int bits; \
std::vector<NamedReg> regs; \
std::string out_name; \
std::string size; \
} regsets[] = { \
{ 64, xmmnames, "xmm0", sizename }, \
}; \
for (const auto& regset : regsets) \
for (const auto& r : regset.regs) \
{ \
emitter->Name(r.reg, XMM0, R(XMM0), r.reg); \
emitter->Name(XMM0, XMM0, r.reg, MatR(R12)); \
emitter->Name(XMM0, r.reg, MatR(R12), XMM0); \
ExpectDisassembly(#Name " " + r.name+ ", " + regset.out_name + ", " + regset.out_name + ", " + r.name + " " \
#Name " " + regset.out_name + ", " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12] " \
#Name " " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12], " + regset.out_name); \
} \
}
// Instantiates VEX_RRMR_RRRM_TEST twice for one FMA4 mnemonic stem: once with the "S"
// suffix and once with the "D" suffix appended to Stem + Kind (for example VFMADD, P ->
// VFMADDPS and VFMADDPD). is_packed selects the memory-operand size keyword the
// disassembly is expected to show: "dqword" when true, otherwise "dword" for the S test
// and "qword" for the D test.
#define FMA4_TEST(Stem, Kind, is_packed) \
	VEX_RRMR_RRRM_TEST(Stem ## Kind ## S, is_packed ? "dqword" : "dword") \
	VEX_RRMR_RRRM_TEST(Stem ## Kind ## D, is_packed ? "dqword" : "qword")

FMA4_TEST(VFMADD, P, true)
FMA4_TEST(VFMADD, S, false)
FMA4_TEST(VFMSUB, P, true)
FMA4_TEST(VFMSUB, S, false)
FMA4_TEST(VFNMADD, P, true)
FMA4_TEST(VFNMADD, S, false)
FMA4_TEST(VFNMSUB, P, true)
FMA4_TEST(VFNMSUB, S, false)
// The two alternating add/sub forms are only instantiated with the P kind.
FMA4_TEST(VFMADDSUB, P, true)
FMA4_TEST(VFMSUBADD, P, true)
} // namespace Gen