From 9734f0c8349f6a56bf35d8dc4ec0523cb05b305e Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 6 Sep 2014 03:41:17 -0700 Subject: [PATCH] JIT64: use FMA instructions --- Source/Core/Core/Core.cpp | 3 + .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 40 +++++++- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 99 ++++++++++++------- 3 files changed, 104 insertions(+), 38 deletions(-) diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp index 8bbd97f0be..2b36fa6f3f 100644 --- a/Source/Core/Core/Core.cpp +++ b/Source/Core/Core/Core.cpp @@ -49,6 +49,7 @@ #include "Core/HW/Wiimote.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h" #include "Core/IPC_HLE/WII_Socket.h" +#include "Core/PowerPC/JitInterface.h" #include "Core/PowerPC/PowerPC.h" #ifdef USE_GDBSTUB @@ -728,6 +729,8 @@ void UpdateWantDeterminism(bool initial) g_want_determinism = new_want_determinism; WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism); g_video_backend->UpdateWantDeterminism(new_want_determinism); + // We need to clear the cache because some parts of the JIT depend on want_determinism, e.g. use of FMA. + JitInterface::ClearCache(); Core::PauseAndLock(false, was_unpaused); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index e7086b59b9..0b58eb03cb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -90,9 +90,44 @@ void Jit64::fmaddXX(UGeckoInstruction inst) fpr.Lock(a, b, c, d); - // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately - if (inst.SUBOP5 == 30) //nmsub + // While we don't know if any games are actually affected (replays seem to work with all the usual + // suspects for desyncing), netplay and other applications need absolute perfect determinism, so + // be extra careful and don't use FMA, even if in theory it might be okay. + // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared + // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin + // instances on different computers giving identical results. + if (cpu_info.bFMA && !Core::g_want_determinism) { + if (single_precision) + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + else + MOVSD(XMM0, fpr.R(c)); + // Statistics suggests b is a lot less likely to be unbound in practice, so + // if we have to pick one of a or b to bind, let's make it b. + fpr.BindToRegister(b, true, false); + switch (inst.SUBOP5) + { + case 28: //msub + VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 29: //madd + VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + // PowerPC and x86 define NMADD/NMSUB differently + // x86: D = -A*C (+/-) B + // PPC: D = -(A*C (+/-) B) + // so we have to swap them; the ADD/SUB here isn't a typo. + case 30: //nmsub + VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 31: //nmadd + VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + } + } + else if (inst.SUBOP5 == 30) //nmsub + { + // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately if (single_precision) Force25BitPrecision(XMM1, fpr.R(c), XMM0); else @@ -115,6 +150,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (inst.SUBOP5 == 31) //nmadd PXOR(XMM0, M((void*)&psSignBits)); } + fpr.BindToRegister(d, false); //YES it is necessary to dupe the result :( //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index cd069bb9fc..6cb6a20cba 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -305,50 +305,77 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; int d = inst.FD; + bool fma = cpu_info.bFMA && !Core::g_want_determinism; fpr.Lock(a,b,c,d); - switch (inst.SUBOP5) + if (fma) + fpr.BindToRegister(b, true, false); + + if (inst.SUBOP5 == 14) { - case 14: //madds0 MOVDDUP(XMM0, fpr.R(c)); Force25BitPrecision(XMM0, R(XMM0), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 15: //madds1 + } + else if (inst.SUBOP5 == 15) + { avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); Force25BitPrecision(XMM0, R(XMM0), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 28: //msub - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - break; - case 29: //madd - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 30: //nmsub - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - PXOR(XMM0, M((void*)&psSignBits)); - break; - case 31: //nmadd - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - PXOR(XMM0, M((void*)&psSignBits)); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); - //FallBackToInterpreter(inst); - //fpr.UnlockAll(); - return; } + else + { + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + } + + if (fma) + { + switch (inst.SUBOP5) + { + case 14: //madds0 + case 15: //madds1 + case 29: //madd + VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 28: //msub + VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 30: //nmsub + VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 31: //nmadd + VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + } + } + else + { + switch (inst.SUBOP5) + { + case 14: //madds0 + case 15: //madds1 + case 29: //madd + MULPD(XMM0, fpr.R(a)); + ADDPD(XMM0, fpr.R(b)); + break; + case 28: //msub + MULPD(XMM0, fpr.R(a)); + SUBPD(XMM0, fpr.R(b)); + break; + case 30: //nmsub + MULPD(XMM0, fpr.R(a)); + SUBPD(XMM0, fpr.R(b)); + PXOR(XMM0, M((void*)&psSignBits)); + break; + case 31: //nmadd + MULPD(XMM0, fpr.R(a)); + ADDPD(XMM0, fpr.R(b)); + PXOR(XMM0, M((void*)&psSignBits)); + break; + default: + _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); + return; + } + } + fpr.BindToRegister(d, false); ForceSinglePrecisionP(fpr.RX(d), XMM0); SetFPRFIfNeeded(inst, fpr.RX(d));