JIT64: Fix fmul rounding issues

Thanks to magumagu's softfp experiments, we know a lot more about the Wii's
strange floating point unit than we used to. In particular, when doing a
single-precision floating point multiply (fmulsx), it rounds the right hand
side's mantissa so as to lose the low 28 bits (of the 53-bit mantissa).

Emulating this behavior in Dolphin fixes a bunch of issues with games that
require extremely precise emulation of floating point hardware, especially
game replays. Fortunately, we can do this with rather little CPU cost; just ~5
extra instructions per multiply, instead of the vast load of a pure-software
float implementation.

This doesn't make floating-point behavior at all perfect. I still suspect
fmadd rounding might not be quite right, since the Wii uses fused instructions
and Dolphin doesn't, and NaN/infinity/exception handling is probably off in
various ways... but it's definitely way better than before.

This appears to fix replays in Mario Kart Wii, Mario Kart Double Dash, and
Super Smash Brothers Brawl. I wouldn't be surprised if it fixes a bunch of
other stuff too.

The changes to instructions other than fmulsx may not be strictly necessary,
but I included them for completeness, since it feels wrong to fix some
instructions but not others, since some games we didn't test might rely on
them.
This commit is contained in:
Fiora 2014-08-18 19:58:00 -07:00
parent b749a74d33
commit 4f18f6078f
5 changed files with 90 additions and 20 deletions

View File

@ -117,10 +117,10 @@ public:
// is set or not.
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
// OPCODES
void unknown_instruction(UGeckoInstruction _inst);

View File

@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
{
fpr.Lock(d, a, b);
if (d == a)
if (roundRHS)
{
if (d == a)
{
fpr.BindToRegister(d, true);
MOVSD(XMM0, fpr.R(b));
Force25BitPrecision(XMM0, XMM1);
(this->*op)(fpr.RX(d), R(XMM0));
}
else
{
fpr.BindToRegister(d, d == b);
if (d != b)
MOVSD(fpr.RX(d), fpr.R(b));
Force25BitPrecision(fpr.RX(d), XMM0);
(this->*op)(fpr.RX(d), fpr.R(a));
}
}
else if (d == a)
{
fpr.BindToRegister(d, true);
if (!single)
@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
}
@ -111,24 +129,25 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
int d = inst.FD;
fpr.Lock(a, b, c, d);
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
SUBSD(XMM0, fpr.R(b));
break;
case 29: //madd
MULSD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
ADDSD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULSD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
SUBSD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
ADDSD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits2));
break;

View File

@ -113,11 +113,29 @@ add a,b,a
*/
//There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
{
fpr.Lock(d, a, b);
if (d == a)
if (roundRHS)
{
if (d == a)
{
fpr.BindToRegister(d, true);
MOVAPD(XMM0, fpr.R(b));
Force25BitPrecision(XMM0, XMM1);
(this->*op)(fpr.RX(d), R(XMM0));
}
else
{
fpr.BindToRegister(d, d == b);
if (d != b)
MOVAPD(fpr.RX(d), fpr.R(b));
Force25BitPrecision(fpr.RX(d), XMM0);
(this->*op)(fpr.RX(d), fpr.R(a));
}
}
else if (d == a)
{
fpr.BindToRegister(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
break;
case 25: // mul
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
case 12:
// Single multiply scalar high
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVDDUP(XMM1, fpr.R(c));
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
break;
case 13:
// TODO - faster version for when regs are different
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(c));
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
MULPD(XMM0, R(XMM1));
MOVAPD(fpr.R(d), XMM0);
@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
int d = inst.FD;
fpr.Lock(a,b,c,d);
MOVAPD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 14: //madds0
MOVDDUP(XMM1, fpr.R(c));
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 15: //madds1
MOVAPD(XMM1, fpr.R(c));
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
Force25BitPrecision(XMM1, XMM0);
MOVAPD(XMM0, fpr.R(a));
MULPD(XMM0, R(XMM1));
ADDPD(XMM0, fpr.R(b));
break;
case 28: //msub
MULPD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
break;
case 29: //madd
MULPD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULPD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
MULPD(XMM0, fpr.R(a));
SUBPD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits));
break;
case 31: //nmadd
MULPD(XMM0, fpr.R(c));
MOVAPD(XMM0, fpr.R(c));
Force25BitPrecision(XMM0, XMM1);
MULPD(XMM0, fpr.R(a));
ADDPD(XMM0, fpr.R(b));
PXOR(XMM0, M((void*)&psSignBits));
break;

View File

@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
}
}
static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
// rounding to nearest as it does.
// It needs a temp, so let the caller pass that in.
void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
{
if (jit->jo.accurateSinglePrecision)
{
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
MOVAPD(tmp, R(xmm));
PAND(xmm, M((void*)&psMantissaTruncate));
PAND(tmp, M((void*)&psRoundBit));
PADDQ(xmm, R(tmp));
}
}
static u32 GC_ALIGNED16(temp32);
static u64 GC_ALIGNED16(temp64);

View File

@ -56,6 +56,7 @@ public:
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
// EAX might get trashed
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);