From aec38466d9c68a735da61786053030d4b333bcf0 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Sun, 7 Jun 2015 14:38:09 +0200 Subject: [PATCH 1/2] Jit64: optionally accurate NaNs When AccurateNaNs is enabled, NaNs are handled accurately by checking for NaN results and choosing the correct input NaN or replacing x86's generated -QNaN with +QNaN. --- Source/Core/Core/BootManager.cpp | 5 +- Source/Core/Core/CoreParameter.cpp | 3 +- Source/Core/Core/CoreParameter.h | 1 + Source/Core/Core/PowerPC/Jit64/Jit.h | 13 +- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 14 ++ .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 167 +++++++++++++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 22 +-- 7 files changed, 183 insertions(+), 42 deletions(-) diff --git a/Source/Core/Core/BootManager.cpp b/Source/Core/Core/BootManager.cpp index 10fda6c35e..85396fad4b 100644 --- a/Source/Core/Core/BootManager.cpp +++ b/Source/Core/Core/BootManager.cpp @@ -47,7 +47,7 @@ namespace BootManager // Apply fire liberally struct ConfigCache { - bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread, + bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bAccurateNaNs, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread, bSyncGPU, bFastDiscSpeed, bDSPHLE, bHLE_BS2, bProgressive; int iCPUCore, Volume; int iWiimoteSource[MAX_BBMOTES]; @@ -106,6 +106,7 @@ bool BootCore(const std::string& _rFilename) config_cache.bSyncGPUOnSkipIdleHack = StartUp.bSyncGPUOnSkipIdleHack; config_cache.iCPUCore = StartUp.iCPUCore; config_cache.bFPRF = StartUp.bFPRF; + config_cache.bAccurateNaNs = StartUp.bAccurateNaNs; config_cache.bMMU = StartUp.bMMU; config_cache.bDCBZOFF = StartUp.bDCBZOFF; config_cache.bSyncGPU = StartUp.bSyncGPU; @@ -146,6 +147,7 @@ bool BootCore(const std::string& _rFilename) core_section->Get("SkipIdle", &StartUp.bSkipIdle, StartUp.bSkipIdle); core_section->Get("SyncOnSkipIdle", &StartUp.bSyncGPUOnSkipIdleHack, StartUp.bSyncGPUOnSkipIdleHack); 
core_section->Get("FPRF", &StartUp.bFPRF, StartUp.bFPRF); + core_section->Get("AccurateNaNs", &StartUp.bAccurateNaNs, StartUp.bAccurateNaNs); core_section->Get("MMU", &StartUp.bMMU, StartUp.bMMU); core_section->Get("DCBZ", &StartUp.bDCBZOFF, StartUp.bDCBZOFF); core_section->Get("SyncGPU", &StartUp.bSyncGPU, StartUp.bSyncGPU); @@ -273,6 +275,7 @@ void Stop() StartUp.bSyncGPUOnSkipIdleHack = config_cache.bSyncGPUOnSkipIdleHack; StartUp.iCPUCore = config_cache.iCPUCore; StartUp.bFPRF = config_cache.bFPRF; + StartUp.bAccurateNaNs = config_cache.bAccurateNaNs; StartUp.bMMU = config_cache.bMMU; StartUp.bDCBZOFF = config_cache.bDCBZOFF; StartUp.bSyncGPU = config_cache.bSyncGPU; diff --git a/Source/Core/Core/CoreParameter.cpp b/Source/Core/Core/CoreParameter.cpp index b06ca58fce..8acfccbab7 100644 --- a/Source/Core/Core/CoreParameter.cpp +++ b/Source/Core/Core/CoreParameter.cpp @@ -33,7 +33,7 @@ SCoreStartupParameter::SCoreStartupParameter() bJITPairedOff(false), bJITSystemRegistersOff(false), bJITBranchOff(false), bJITILTimeProfiling(false), bJITILOutputIR(false), - bFPRF(false), + bFPRF(false), bAccurateNaNs(false), bCPUThread(true), bDSPThread(false), bDSPHLE(true), bSkipIdle(true), bSyncGPUOnSkipIdleHack(true), bNTSC(false), bForceNTSCJ(false), bHLE_BS2(true), bEnableCheats(false), @@ -78,6 +78,7 @@ void SCoreStartupParameter::LoadDefaults() bDSPHLE = true; bFastmem = true; bFPRF = false; + bAccurateNaNs = false; bMMU = false; bDCBZOFF = false; iBBDumpPort = -1; diff --git a/Source/Core/Core/CoreParameter.h b/Source/Core/Core/CoreParameter.h index 43d758bbad..956ea5f318 100644 --- a/Source/Core/Core/CoreParameter.h +++ b/Source/Core/Core/CoreParameter.h @@ -163,6 +163,7 @@ struct SCoreStartupParameter bool bFastmem; bool bFPRF; + bool bAccurateNaNs; bool bCPUThread; bool bDSPThread; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 391f353e82..a344242e10 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -135,13 +135,18 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(Gen::X64Reg xmm); + void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in); + void MultiplyImmediate(u32 imm, int a, int d, bool overflow); typedef u32 (*Operation)(u32 a, u32 b); - void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), - bool Rc = false, bool carry = false); - void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&), - void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), bool packed = false, bool roundRHS = false); + void regimmop(int d, int a, bool binary, u32 value, Operation doop, + void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + bool Rc = false, bool carry = false); + Gen::X64Reg fp_tri_op(int d, int a, int b, bool reversible, bool single, + void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), + bool packed, bool preserve_inputs, bool roundRHS = false); void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 2c572c3585..05d80ce8f6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -138,6 +138,20 @@ public: LockX(args...); } + template <typename T> + void UnlockX(T x) + { + if (!xregs[x].locked) + PanicAlert("RegCache: x %i already unlocked!", x); + xregs[x].locked = false; + } + template <typename T, typename... Args> + void UnlockX(T first, Args...
args) + { + UnlockX(first); + UnlockX(args...); + } + void UnlockAll(); void UnlockAllX(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index ebaac9f320..c9bed760e2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -10,38 +10,37 @@ using namespace Gen; -static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL}; -static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; -static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; -static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; +static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL}; +static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; +static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; +static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; +static const u64 GC_ALIGNED16(psGeneratedQNaN[2]) = {0x7FF8000000000000ULL, 0x7FF8000000000000ULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; -void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), - void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool roundRHS) +X64Reg Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), + void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool preserve_inputs, bool roundRHS) { fpr.Lock(d, a, b); fpr.BindToRegister(d, d == a || d == b || !single); + X64Reg dest = preserve_inputs ? 
XMM1 : fpr.RX(d); if (roundRHS) { - if (d == a) + if (d == a && !preserve_inputs) { Force25BitPrecision(XMM0, fpr.R(b), XMM1); (this->*sseOp)(fpr.RX(d), R(XMM0)); } else { - Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0); - (this->*sseOp)(fpr.RX(d), fpr.R(a)); + Force25BitPrecision(dest, fpr.R(b), XMM0); + (this->*sseOp)(dest, fpr.R(a)); } } else { - avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible); + avx_op(avxOp, sseOp, dest, fpr.R(a), fpr.R(b), packed, reversible); } - if (single) - ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true); - SetFPRFIfNeeded(fpr.RX(d)); - fpr.UnlockAll(); + return dest; } // We can avoid calculating FPRF if it's not needed; every float operation resets it, so @@ -56,6 +55,112 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm) SetFPRF(xmm); } +void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm) +{ + // | PowerPC | x86 + // ---------------------+----------+--------- + // input NaN precedence | 1*3 + 2 | 1*2 + 3 + // generated QNaN | positive | negative + // + // Dragon Ball: Revenge of King Piccolo requires generated NaNs + // to be positive, so we'll have to handle them manually. 
+ + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs) + { + if (xmm_out != xmm) + MOVAPD(xmm_out, R(xmm)); + return; + } + + _assert_(xmm != XMM0); + + std::vector<u32> inputs; + u32 a = inst.FA, b = inst.FB, c = inst.FC; + for (u32 i : {a, b, c}) + { + if (!js.op->fregsIn[i]) + continue; + if (std::find(inputs.begin(), inputs.end(), i) == inputs.end()) + inputs.push_back(i); + } + if (inst.OPCD != 4) + { + // not paired-single + UCOMISD(xmm, R(xmm)); + FixupBranch handle_nan = J_CC(CC_P, true); + SwitchToFarCode(); + SetJumpTarget(handle_nan); + std::vector<FixupBranch> fixups; + for (u32 x : inputs) + { + MOVDDUP(xmm, fpr.R(x)); + UCOMISD(xmm, R(xmm)); + fixups.push_back(J_CC(CC_P)); + } + MOVDDUP(xmm, M(psGeneratedQNaN)); + for (FixupBranch fixup : fixups) + SetJumpTarget(fixup); + FixupBranch done = J(true); + SwitchToNearCode(); + SetJumpTarget(done); + } + else + { + // paired-single + std::reverse(inputs.begin(), inputs.end()); + if (cpu_info.bSSE4_1) + { + avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R(xmm), R(xmm), CMP_UNORD); + PTEST(XMM0, R(XMM0)); + FixupBranch handle_nan = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(handle_nan); + BLENDVPD(xmm, M(psGeneratedQNaN)); + for (u32 x : inputs) + { + avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R(x), fpr.R(x), CMP_UNORD); + BLENDVPD(xmm, fpr.R(x)); + } + FixupBranch done = J(true); + SwitchToNearCode(); + SetJumpTarget(done); + } + else + { + // SSE2 fallback + X64Reg tmp = fpr.GetFreeXReg(); + fpr.FlushLockX(tmp); + MOVAPD(XMM0, R(xmm)); + CMPPD(XMM0, R(XMM0), CMP_UNORD); + MOVMSKPD(RSCRATCH, R(XMM0)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + FixupBranch handle_nan = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(handle_nan); + MOVAPD(tmp, R(XMM0)); + PANDN(XMM0, R(xmm)); + PAND(tmp, M(psGeneratedQNaN)); + POR(tmp, R(XMM0)); + MOVAPD(xmm, R(tmp)); + for (u32 x : inputs) + { + MOVAPD(XMM0, fpr.R(x)); + CMPPD(XMM0, R(XMM0), CMP_ORD); + MOVAPD(tmp, R(XMM0)); + PANDN(XMM0,
fpr.R(x)); + PAND(xmm, R(tmp)); + POR(xmm, R(XMM0)); + } + FixupBranch done = J(true); + SwitchToNearCode(); + SetJumpTarget(done); + fpr.UnlockX(tmp); + } + } + if (xmm_out != xmm) + MOVAPD(xmm_out, R(xmm)); +} + void Jit64::fp_arith(UGeckoInstruction inst) { INSTRUCTION_START @@ -80,20 +185,27 @@ void Jit64::fp_arith(UGeckoInstruction inst) packed = false; bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + bool preserve_inputs = SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs; + X64Reg dest = INVALID_REG; switch (inst.SUBOP5) { - case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, - packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break; - case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, - packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break; - case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, - packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break; - case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, - packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break; + case 18: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, + packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed, preserve_inputs); break; + case 20: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, + packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed, preserve_inputs); break; + case 21: dest = fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, + packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed, preserve_inputs); break; + case 25: dest = fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, + packed ? 
&XEmitter::MULPD : &XEmitter::MULSD, packed, preserve_inputs, round_input); break; default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } + HandleNaNs(inst, fpr.RX(d), dest); + if (single) + ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true); + SetFPRFIfNeeded(fpr.RX(d)); + fpr.UnlockAll(); } void Jit64::fmaddXX(UGeckoInstruction inst) @@ -220,13 +332,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (inst.SUBOP5 == 31) //nmadd PXOR(XMM1, M(packed ? psSignBits2 : psSignBits)); } - fpr.BindToRegister(d, !single); - if (single) - ForceSinglePrecision(fpr.RX(d), R(XMM1), packed, true); + { + HandleNaNs(inst, fpr.RX(d), XMM1); + ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true); + } else + { + HandleNaNs(inst, XMM1, XMM1); MOVSD(fpr.RX(d), R(XMM1)); + } SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -379,7 +495,6 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper) } else { - // Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception? 
UCOMISD(fpr.RX(b), fpr.R(a)); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 3e2b855f48..a985d265ac 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -38,7 +38,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) fpr.Lock(a, b, c, d); OpArg op_a = fpr.R(a); fpr.BindToRegister(d, d == b || d == c); - X64Reg tmp = XMM0; + X64Reg tmp = XMM1; MOVDDUP(tmp, op_a); // {a.ps0, a.ps0} ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1} switch (inst.SUBOP5) @@ -55,9 +55,9 @@ void Jit64::ps_sum(UGeckoInstruction inst) } else { - MOVAPD(XMM1, fpr.R(c)); - SHUFPD(XMM1, R(tmp), 2); - tmp = XMM1; + MOVAPD(XMM0, fpr.R(c)); + SHUFPD(XMM0, R(tmp), 2); + tmp = XMM0; } } else @@ -68,7 +68,8 @@ void Jit64::ps_sum(UGeckoInstruction inst) default: PanicAlert("ps_sum WTF!!!"); } - ForceSinglePrecision(fpr.RX(d), R(tmp)); + HandleNaNs(inst, fpr.RX(d), tmp); + ForceSinglePrecision(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } @@ -88,19 +89,20 @@ void Jit64::ps_muls(UGeckoInstruction inst) switch (inst.SUBOP5) { case 12: // ps_muls0 - MOVDDUP(XMM0, fpr.R(c)); + MOVDDUP(XMM1, fpr.R(c)); break; case 13: // ps_muls1 - avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3); break; default: PanicAlert("ps_muls WTF!!!"); } if (round_input) - Force25BitPrecision(XMM0, R(XMM0), XMM1); - MULPD(XMM0, fpr.R(a)); + Force25BitPrecision(XMM1, R(XMM1), XMM0); + MULPD(XMM1, fpr.R(a)); fpr.BindToRegister(d, false); - ForceSinglePrecision(fpr.RX(d), R(XMM0)); + HandleNaNs(inst, fpr.RX(d), XMM1); + ForceSinglePrecision(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(fpr.RX(d)); fpr.UnlockAll(); } From 77685df23fbe9193b590bd26ae820d8107f75498 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Sun, 7 Jun 2015 23:00:26 +0200 Subject: [PATCH 2/2] Enable AccurateNaNs for 
"Dragon Ball: RKP" --- Data/Sys/GameSettings/R7G.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/Data/Sys/GameSettings/R7G.ini b/Data/Sys/GameSettings/R7G.ini index 6b414d00c0..c999355755 100644 --- a/Data/Sys/GameSettings/R7G.ini +++ b/Data/Sys/GameSettings/R7G.ini @@ -2,6 +2,7 @@ [Core] # Values set here will override the main Dolphin settings. +AccurateNaNs = True [EmuState] # The Emulation State. 1 is worst, 5 is best, 0 is not set.