JIT64: use FMA instructions
This commit is contained in:
parent
4289221584
commit
9734f0c834
|
@ -49,6 +49,7 @@
|
|||
#include "Core/HW/Wiimote.h"
|
||||
#include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h"
|
||||
#include "Core/IPC_HLE/WII_Socket.h"
|
||||
#include "Core/PowerPC/JitInterface.h"
|
||||
#include "Core/PowerPC/PowerPC.h"
|
||||
|
||||
#ifdef USE_GDBSTUB
|
||||
|
@ -728,6 +729,8 @@ void UpdateWantDeterminism(bool initial)
|
|||
g_want_determinism = new_want_determinism;
|
||||
WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism);
|
||||
g_video_backend->UpdateWantDeterminism(new_want_determinism);
|
||||
// We need to clear the cache because some parts of the JIT depend on want_determinism, e.g. use of FMA.
|
||||
JitInterface::ClearCache();
|
||||
|
||||
Core::PauseAndLock(false, was_unpaused);
|
||||
}
|
||||
|
|
|
@ -90,9 +90,44 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
|
||||
fpr.Lock(a, b, c, d);
|
||||
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (inst.SUBOP5 == 30) //nmsub
|
||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||
// suspects for desyncing), netplay and other applications need absolutely perfect determinism, so
|
||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
||||
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
||||
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
||||
// instances on different computers giving identical results.
|
||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||
{
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
// Statistics suggest b is a lot less likely to be unbound in practice, so
|
||||
// if we have to pick one of a or b to bind, let's make it b.
|
||||
fpr.BindToRegister(b, true, false);
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 28: //msub
|
||||
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 29: //madd
|
||||
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
// PowerPC and x86 define NMADD/NMSUB differently
|
||||
// x86: D = -A*C (+/-) B
|
||||
// PPC: D = -(A*C (+/-) B)
|
||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||
case 30: //nmsub
|
||||
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (inst.SUBOP5 == 30) //nmsub
|
||||
{
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||
else
|
||||
|
@ -115,6 +150,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
if (inst.SUBOP5 == 31) //nmadd
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
}
|
||||
|
||||
fpr.BindToRegister(d, false);
|
||||
//YES it is necessary to dupe the result :(
|
||||
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
|
||||
|
|
|
@ -305,50 +305,77 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
|||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
||||
fpr.Lock(a,b,c,d);
|
||||
|
||||
switch (inst.SUBOP5)
|
||||
if (fma)
|
||||
fpr.BindToRegister(b, true, false);
|
||||
|
||||
if (inst.SUBOP5 == 14)
|
||||
{
|
||||
case 14: //madds0
|
||||
MOVDDUP(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 15: //madds1
|
||||
}
|
||||
else if (inst.SUBOP5 == 15)
|
||||
{
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 28: //msub
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 29: //madd
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
|
||||
//FallBackToInterpreter(inst);
|
||||
//fpr.UnlockAll();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
}
|
||||
|
||||
if (fma)
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14: //madds0
|
||||
case 15: //madds1
|
||||
case 29: //madd
|
||||
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 28: //msub
|
||||
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14: //madds0
|
||||
case 15: //madds1
|
||||
case 29: //madd
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 28: //msub
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
fpr.BindToRegister(d, false);
|
||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
|
|
Loading…
Reference in New Issue