diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index af4bc6dd3d..d3a4487c4c 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -94,6 +94,15 @@ inline double ForceDouble(double d)
 	return d;
 }
 
+// Rounds the frC operand of a single-precision multiply: clears the low 28 bits of the
+// double's bit pattern and, when bit 27 was set, carries one into bit 28.
+inline double Force25Bit(double d)
+{
+	u64 di = *(u64*)&d;
+	di = (di & 0xFFFFFFFFF8000000ULL) + (di & 0x8000000);
+	return *(double*)&di;
+}
+
 // these functions allow globally modify operations behaviour
 // also, these may be used to set flags like FR, FI, OX, UX
 
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
index d3df8716f5..f0ef426e5f 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
@@ -297,7 +297,8 @@ void Interpreter::fmulx(UGeckoInstruction _inst)
 }
 void Interpreter::fmulsx(UGeckoInstruction _inst)
 {
-	double d_value = NI_mul(rPS0(_inst.FA), rPS0(_inst.FC));
+	double c_value = Force25Bit(rPS0(_inst.FC));
+	double d_value = NI_mul(rPS0(_inst.FA), c_value);
 	rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
 	//FPSCR.FI = d_value != rPS0(_inst.FD);
 	FPSCR.FI = 0;
@@ -320,7 +321,8 @@ void Interpreter::fmaddx(UGeckoInstruction _inst)
 
 void Interpreter::fmaddsx(UGeckoInstruction _inst)
 {
-	double d_value = NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) );
+	double c_value = Force25Bit(rPS0(_inst.FC));
+	double d_value = NI_madd(rPS0(_inst.FA), c_value, rPS0(_inst.FB));
 	rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
 	FPSCR.FI = d_value != rPS0(_inst.FD);
 	FPSCR.FR = 0;
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
index bf47c351dc..0888e79042 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
@@ -264,8 +264,10 @@ void Interpreter::ps_add(UGeckoInstruction _inst)
 
 void Interpreter::ps_mul(UGeckoInstruction _inst)
 {
-	rPS0(_inst.FD) = ForceSingle(NI_mul(rPS0(_inst.FA), rPS0(_inst.FC)));
-	rPS1(_inst.FD) = ForceSingle(NI_mul(rPS1(_inst.FA), rPS1(_inst.FC)));
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	rPS0(_inst.FD) = ForceSingle(NI_mul(rPS0(_inst.FA), c0));
+	rPS1(_inst.FD) = ForceSingle(NI_mul(rPS1(_inst.FA), c1));
 	UpdateFPRF(rPS0(_inst.FD));
 
 	if (_inst.Rc)
@@ -275,8 +277,10 @@ void Interpreter::ps_mul(UGeckoInstruction _inst)
 
 void Interpreter::ps_msub(UGeckoInstruction _inst)
 {
-	rPS0(_inst.FD) = ForceSingle(NI_msub(rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)));
-	rPS1(_inst.FD) = ForceSingle(NI_msub(rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)));
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	rPS0(_inst.FD) = ForceSingle(NI_msub(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
+	rPS1(_inst.FD) = ForceSingle(NI_msub(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
 	UpdateFPRF(rPS0(_inst.FD));
 
 	if (_inst.Rc)
@@ -285,8 +289,10 @@ void Interpreter::ps_msub(UGeckoInstruction _inst)
 
 void Interpreter::ps_madd(UGeckoInstruction _inst)
 {
-	rPS0(_inst.FD) = ForceSingle(NI_madd(rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)));
-	rPS1(_inst.FD) = ForceSingle(NI_madd(rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)));
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	rPS0(_inst.FD) = ForceSingle(NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
+	rPS1(_inst.FD) = ForceSingle(NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
 	UpdateFPRF(rPS0(_inst.FD));
 
 	if (_inst.Rc)
@@ -295,8 +301,10 @@ void Interpreter::ps_madd(UGeckoInstruction _inst)
 
 void Interpreter::ps_nmsub(UGeckoInstruction _inst)
 {
-	rPS0(_inst.FD) = ForceSingle( -NI_msub( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) ) );
-	rPS1(_inst.FD) = ForceSingle( -NI_msub( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB) ) );
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	rPS0(_inst.FD) = ForceSingle(-NI_msub(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
+	rPS1(_inst.FD) = ForceSingle(-NI_msub(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
 	UpdateFPRF(rPS0(_inst.FD));
 
 	if (_inst.Rc)
@@ -305,8 +313,10 @@ void Interpreter::ps_nmsub(UGeckoInstruction _inst)
 
 void Interpreter::ps_nmadd(UGeckoInstruction _inst)
 {
-	rPS0(_inst.FD) = ForceSingle( -NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) ) );
-	rPS1(_inst.FD) = ForceSingle( -NI_madd( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB) ) );
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	rPS0(_inst.FD) = ForceSingle(-NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
+	rPS1(_inst.FD) = ForceSingle(-NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
 	UpdateFPRF(rPS0(_inst.FD));
 
 	if (_inst.Rc)
@@ -339,8 +349,9 @@ void Interpreter::ps_sum1(UGeckoInstruction _inst)
 
 void Interpreter::ps_muls0(UGeckoInstruction _inst)
 {
-	double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), rPS0(_inst.FC)));
-	double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), rPS0(_inst.FC)));
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), c0));
+	double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), c0));
 	rPS0(_inst.FD) = p0;
 	rPS1(_inst.FD) = p1;
 	UpdateFPRF(rPS0(_inst.FD));
@@ -351,8 +362,9 @@ void Interpreter::ps_muls0(UGeckoInstruction _inst)
 
 void Interpreter::ps_muls1(UGeckoInstruction _inst)
 {
-	double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), rPS1(_inst.FC)));
-	double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), rPS1(_inst.FC)));
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), c1));
+	double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), c1));
 	rPS0(_inst.FD) = p0;
 	rPS1(_inst.FD) = p1;
 	UpdateFPRF(rPS0(_inst.FD));
@@ -363,8 +375,10 @@ void Interpreter::ps_muls1(UGeckoInstruction _inst)
 
 void Interpreter::ps_madds0(UGeckoInstruction _inst)
 {
-	double p0 = ForceSingle( NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)) );
-	double p1 = ForceSingle( NI_madd( rPS1(_inst.FA), rPS0(_inst.FC), rPS1(_inst.FB)) );
+	// Both lanes multiply by the same scalar: the rounded PS0 of frC.
+	double c0 = Force25Bit(rPS0(_inst.FC));
+	double p0 = ForceSingle(NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
+	double p1 = ForceSingle(NI_madd(rPS1(_inst.FA), c0, rPS1(_inst.FB)));
 	rPS0(_inst.FD) = p0;
 	rPS1(_inst.FD) = p1;
 	UpdateFPRF(rPS0(_inst.FD));
@@ -375,8 +389,10 @@ void Interpreter::ps_madds0(UGeckoInstruction _inst)
 
 void Interpreter::ps_madds1(UGeckoInstruction _inst)
 {
-	double p0 = ForceSingle( NI_madd( rPS0(_inst.FA), rPS1(_inst.FC), rPS0(_inst.FB)) );
-	double p1 = ForceSingle( NI_madd( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)) );
+	// Both lanes multiply by the same scalar: the rounded PS1 of frC.
+	double c1 = Force25Bit(rPS1(_inst.FC));
+	double p0 = ForceSingle(NI_madd(rPS0(_inst.FA), c1, rPS0(_inst.FB)));
+	double p1 = ForceSingle(NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
 	rPS0(_inst.FD) = p0;
 	rPS1(_inst.FD) = p1;
 	UpdateFPRF(rPS0(_inst.FD));
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 95fbcd1f7b..97aa522d22 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -117,10 +117,10 @@ public:
 	// is set or not.
 	Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 
-	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
 	typedef u32 (*Operation)(u32 a, u32 b);
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
+	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
 
 	// OPCODES
 	void unknown_instruction(UGeckoInstruction _inst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 7761c636d2..6be6680708 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
 static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
 {
 	fpr.Lock(d, a, b);
-	if (d == a)
+	if (roundRHS)
+	{
+		if (d == a)
+		{
+			fpr.BindToRegister(d, true);
+			MOVSD(XMM0, fpr.R(b));
+			Force25BitPrecision(XMM0, XMM1);
+			(this->*op)(fpr.RX(d), R(XMM0));
+		}
+		else
+		{
+			fpr.BindToRegister(d, d == b);
+			if (d != b)
+				MOVSD(fpr.RX(d), fpr.R(b));
+			Force25BitPrecision(fpr.RX(d), XMM0);
+			(this->*op)(fpr.RX(d), fpr.R(a));
+		}
+	}
+	else if (d == a)
 	{
 		fpr.BindToRegister(d, true);
 		if (!single)
@@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
 	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
 	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
 
@@ -111,24 +129,27 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	int d = inst.FD;
 
 	fpr.Lock(a, b, c, d);
-	MOVSD(XMM0, fpr.R(a));
+	MOVSD(XMM0, fpr.R(c));
+	// Round frC only for the single-precision forms (primary opcode 59), like fp_arith does.
+	if (inst.OPCD == 59)
+		Force25BitPrecision(XMM0, XMM1);
 	switch (inst.SUBOP5)
 	{
 	case 28: //msub
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		SUBSD(XMM0, fpr.R(b));
 		break;
 	case 29: //madd
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		ADDSD(XMM0, fpr.R(b));
 		break;
 	case 30: //nmsub
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		SUBSD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits2));
 		break;
 	case 31: //nmadd
-		MULSD(XMM0, fpr.R(c));
+		MULSD(XMM0, fpr.R(a));
 		ADDSD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits2));
 		break;
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index 6934f56d42..b2fc5e4150 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -113,11 +113,29 @@ add a,b,a
 */
 
 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
 {
 	fpr.Lock(d, a, b);
 
-	if (d == a)
+	if (roundRHS)
+	{
+		if (d == a)
+		{
+			fpr.BindToRegister(d, true);
+			MOVAPD(XMM0, fpr.R(b));
+			Force25BitPrecision(XMM0, XMM1);
+			(this->*op)(fpr.RX(d), R(XMM0));
+		}
+		else
+		{
+			fpr.BindToRegister(d, d == b);
+			if (d != b)
+				MOVAPD(fpr.RX(d), fpr.R(b));
+			Force25BitPrecision(fpr.RX(d), XMM0);
+			(this->*op)(fpr.RX(d), fpr.R(a));
+		}
+	}
+	else if (d == a)
 	{
 		fpr.BindToRegister(d, true);
 		(this->*op)(fpr.RX(d), fpr.R(b));
@@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	case 12:
 		// Single multiply scalar high
 		// TODO - faster version for when regs are different
-		MOVAPD(XMM0, fpr.R(a));
 		MOVDDUP(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		MOVAPD(fpr.R(d), XMM0);
 		break;
 	case 13:
 		// TODO - faster version for when regs are different
-		MOVAPD(XMM0, fpr.R(a));
 		MOVAPD(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
 		MULPD(XMM0, R(XMM1));
 		MOVAPD(fpr.R(d), XMM0);
@@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	int d = inst.FD;
 	fpr.Lock(a,b,c,d);
 
-	MOVAPD(XMM0, fpr.R(a));
 	switch (inst.SUBOP5)
 	{
 	case 14: //madds0
 		MOVDDUP(XMM1, fpr.R(c));
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 15: //madds1
 		MOVAPD(XMM1, fpr.R(c));
 		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
+		Force25BitPrecision(XMM1, XMM0);
+		MOVAPD(XMM0, fpr.R(a));
 		MULPD(XMM0, R(XMM1));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 28: //msub
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		break;
 	case 29: //madd
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		break;
 	case 30: //nmsub
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		SUBPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
 		break;
 	case 31: //nmadd
-		MULPD(XMM0, fpr.R(c));
+		MOVAPD(XMM0, fpr.R(c));
+		Force25BitPrecision(XMM0, XMM1);
+		MULPD(XMM0, fpr.R(a));
 		ADDPD(XMM0, fpr.R(b));
 		PXOR(XMM0, M((void*)&psSignBits));
 		break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index e7d51213bb..9aa58b7f5b 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
 	}
 }
 
+static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
+static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
+
+// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
+// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
+// rounding to nearest as it does.
+// It needs a temp, so let the caller pass that in.
+void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
+{
+	if (jit->jo.accurateSinglePrecision)
+	{
+		// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
+		MOVAPD(tmp, R(xmm));
+		PAND(xmm, M((void*)&psMantissaTruncate));
+		PAND(tmp, M((void*)&psRoundBit));
+		PADDQ(xmm, R(tmp));
+	}
+}
+
 static u32 GC_ALIGNED16(temp32);
 static u64 GC_ALIGNED16(temp64);
 
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index af68493571..22bd922d30 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -56,6 +56,7 @@ public:
 
 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
+	void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
 
 	// EAX might get trashed
 	void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);