Merge pull request #834 from FioraAeterna/fixfmulrounding
JIT64: Fix fmul rounding issues
This commit is contained in:
commit
ebf1b98106
|
@ -94,6 +94,13 @@ inline double ForceDouble(double d)
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline double Force25Bit(double d)
|
||||||
|
{
|
||||||
|
u64 di = *(u64*)&d;
|
||||||
|
di = (di & 0xFFFFFFFFF8000000ULL) + (di & 0x8000000);
|
||||||
|
return *(double*)&di;
|
||||||
|
}
|
||||||
|
|
||||||
// these functions allow globally modify operations behaviour
|
// these functions allow globally modify operations behaviour
|
||||||
// also, these may be used to set flags like FR, FI, OX, UX
|
// also, these may be used to set flags like FR, FI, OX, UX
|
||||||
|
|
||||||
|
|
|
@ -297,7 +297,8 @@ void Interpreter::fmulx(UGeckoInstruction _inst)
|
||||||
}
|
}
|
||||||
void Interpreter::fmulsx(UGeckoInstruction _inst)
|
void Interpreter::fmulsx(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double d_value = NI_mul(rPS0(_inst.FA), rPS0(_inst.FC));
|
double c_value = Force25Bit(rPS0(_inst.FC));
|
||||||
|
double d_value = NI_mul(rPS0(_inst.FA), c_value);
|
||||||
rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
|
rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
|
||||||
//FPSCR.FI = d_value != rPS0(_inst.FD);
|
//FPSCR.FI = d_value != rPS0(_inst.FD);
|
||||||
FPSCR.FI = 0;
|
FPSCR.FI = 0;
|
||||||
|
@ -320,7 +321,8 @@ void Interpreter::fmaddx(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::fmaddsx(UGeckoInstruction _inst)
|
void Interpreter::fmaddsx(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double d_value = NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) );
|
double c_value = Force25Bit(rPS0(_inst.FC));
|
||||||
|
double d_value = NI_madd(rPS0(_inst.FA), c_value, rPS0(_inst.FB));
|
||||||
rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
|
rPS0(_inst.FD) = rPS1(_inst.FD) = ForceSingle(d_value);
|
||||||
FPSCR.FI = d_value != rPS0(_inst.FD);
|
FPSCR.FI = d_value != rPS0(_inst.FD);
|
||||||
FPSCR.FR = 0;
|
FPSCR.FR = 0;
|
||||||
|
|
|
@ -264,8 +264,10 @@ void Interpreter::ps_add(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_mul(UGeckoInstruction _inst)
|
void Interpreter::ps_mul(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
rPS0(_inst.FD) = ForceSingle(NI_mul(rPS0(_inst.FA), rPS0(_inst.FC)));
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
rPS1(_inst.FD) = ForceSingle(NI_mul(rPS1(_inst.FA), rPS1(_inst.FC)));
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
rPS0(_inst.FD) = ForceSingle(NI_mul(rPS0(_inst.FA), c0));
|
||||||
|
rPS1(_inst.FD) = ForceSingle(NI_mul(rPS1(_inst.FA), c1));
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
||||||
if (_inst.Rc)
|
if (_inst.Rc)
|
||||||
|
@ -275,8 +277,10 @@ void Interpreter::ps_mul(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_msub(UGeckoInstruction _inst)
|
void Interpreter::ps_msub(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
rPS0(_inst.FD) = ForceSingle(NI_msub(rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)));
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
rPS1(_inst.FD) = ForceSingle(NI_msub(rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)));
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
rPS0(_inst.FD) = ForceSingle(NI_msub(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
rPS1(_inst.FD) = ForceSingle(NI_msub(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
||||||
if (_inst.Rc)
|
if (_inst.Rc)
|
||||||
|
@ -285,8 +289,10 @@ void Interpreter::ps_msub(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_madd(UGeckoInstruction _inst)
|
void Interpreter::ps_madd(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
rPS0(_inst.FD) = ForceSingle(NI_madd(rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)));
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
rPS1(_inst.FD) = ForceSingle(NI_madd(rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)));
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
rPS0(_inst.FD) = ForceSingle(NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
rPS1(_inst.FD) = ForceSingle(NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
||||||
if (_inst.Rc)
|
if (_inst.Rc)
|
||||||
|
@ -295,8 +301,10 @@ void Interpreter::ps_madd(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_nmsub(UGeckoInstruction _inst)
|
void Interpreter::ps_nmsub(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
rPS0(_inst.FD) = ForceSingle( -NI_msub( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) ) );
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
rPS1(_inst.FD) = ForceSingle( -NI_msub( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB) ) );
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
rPS0(_inst.FD) = ForceSingle(-NI_msub(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
rPS1(_inst.FD) = ForceSingle(-NI_msub(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
||||||
if (_inst.Rc)
|
if (_inst.Rc)
|
||||||
|
@ -305,8 +313,10 @@ void Interpreter::ps_nmsub(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_nmadd(UGeckoInstruction _inst)
|
void Interpreter::ps_nmadd(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
rPS0(_inst.FD) = ForceSingle( -NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB) ) );
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
rPS1(_inst.FD) = ForceSingle( -NI_madd( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB) ) );
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
rPS0(_inst.FD) = ForceSingle(-NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
rPS1(_inst.FD) = ForceSingle(-NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
||||||
if (_inst.Rc)
|
if (_inst.Rc)
|
||||||
|
@ -339,8 +349,9 @@ void Interpreter::ps_sum1(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_muls0(UGeckoInstruction _inst)
|
void Interpreter::ps_muls0(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), rPS0(_inst.FC)));
|
double c0 = Force25Bit(rPS1(_inst.FC));
|
||||||
double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), rPS0(_inst.FC)));
|
double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), c0));
|
||||||
|
double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), c0));
|
||||||
rPS0(_inst.FD) = p0;
|
rPS0(_inst.FD) = p0;
|
||||||
rPS1(_inst.FD) = p1;
|
rPS1(_inst.FD) = p1;
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
@ -351,8 +362,9 @@ void Interpreter::ps_muls0(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_muls1(UGeckoInstruction _inst)
|
void Interpreter::ps_muls1(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), rPS1(_inst.FC)));
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), rPS1(_inst.FC)));
|
double p0 = ForceSingle(NI_mul(rPS0(_inst.FA), c1));
|
||||||
|
double p1 = ForceSingle(NI_mul(rPS1(_inst.FA), c1));
|
||||||
rPS0(_inst.FD) = p0;
|
rPS0(_inst.FD) = p0;
|
||||||
rPS1(_inst.FD) = p1;
|
rPS1(_inst.FD) = p1;
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
@ -363,8 +375,10 @@ void Interpreter::ps_muls1(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_madds0(UGeckoInstruction _inst)
|
void Interpreter::ps_madds0(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double p0 = ForceSingle( NI_madd( rPS0(_inst.FA), rPS0(_inst.FC), rPS0(_inst.FB)) );
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
double p1 = ForceSingle( NI_madd( rPS1(_inst.FA), rPS0(_inst.FC), rPS1(_inst.FB)) );
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
double p0 = ForceSingle(NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
double p1 = ForceSingle(NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
rPS0(_inst.FD) = p0;
|
rPS0(_inst.FD) = p0;
|
||||||
rPS1(_inst.FD) = p1;
|
rPS1(_inst.FD) = p1;
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
@ -375,8 +389,10 @@ void Interpreter::ps_madds0(UGeckoInstruction _inst)
|
||||||
|
|
||||||
void Interpreter::ps_madds1(UGeckoInstruction _inst)
|
void Interpreter::ps_madds1(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double p0 = ForceSingle( NI_madd( rPS0(_inst.FA), rPS1(_inst.FC), rPS0(_inst.FB)) );
|
double c0 = Force25Bit(rPS0(_inst.FC));
|
||||||
double p1 = ForceSingle( NI_madd( rPS1(_inst.FA), rPS1(_inst.FC), rPS1(_inst.FB)) );
|
double c1 = Force25Bit(rPS1(_inst.FC));
|
||||||
|
double p0 = ForceSingle(NI_madd(rPS0(_inst.FA), c0, rPS0(_inst.FB)));
|
||||||
|
double p1 = ForceSingle(NI_madd(rPS1(_inst.FA), c1, rPS1(_inst.FB)));
|
||||||
rPS0(_inst.FD) = p0;
|
rPS0(_inst.FD) = p0;
|
||||||
rPS1(_inst.FD) = p1;
|
rPS1(_inst.FD) = p1;
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
|
|
|
@ -117,10 +117,10 @@ public:
|
||||||
// is set or not.
|
// is set or not.
|
||||||
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
||||||
|
|
||||||
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
|
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
|
||||||
typedef u32 (*Operation)(u32 a, u32 b);
|
typedef u32 (*Operation)(u32 a, u32 b);
|
||||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
||||||
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
|
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
|
||||||
|
|
||||||
// OPCODES
|
// OPCODES
|
||||||
void unknown_instruction(UGeckoInstruction _inst);
|
void unknown_instruction(UGeckoInstruction _inst);
|
||||||
|
|
|
@ -14,10 +14,28 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
|
||||||
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
|
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
|
||||||
{
|
{
|
||||||
fpr.Lock(d, a, b);
|
fpr.Lock(d, a, b);
|
||||||
if (d == a)
|
if (roundRHS)
|
||||||
|
{
|
||||||
|
if (d == a)
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(d, true);
|
||||||
|
MOVSD(XMM0, fpr.R(b));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
(this->*op)(fpr.RX(d), R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(d, d == b);
|
||||||
|
if (d != b)
|
||||||
|
MOVSD(fpr.RX(d), fpr.R(b));
|
||||||
|
Force25BitPrecision(fpr.RX(d), XMM0);
|
||||||
|
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (d == a)
|
||||||
{
|
{
|
||||||
fpr.BindToRegister(d, true);
|
fpr.BindToRegister(d, true);
|
||||||
if (!single)
|
if (!single)
|
||||||
|
@ -88,7 +106,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
|
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
|
||||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
|
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
|
||||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
|
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
|
||||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD); break; //mul
|
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||||
}
|
}
|
||||||
|
@ -111,24 +129,25 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
MOVSD(XMM0, fpr.R(a));
|
MOVSD(XMM0, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 28: //msub
|
case 28: //msub
|
||||||
MULSD(XMM0, fpr.R(c));
|
MULSD(XMM0, fpr.R(a));
|
||||||
SUBSD(XMM0, fpr.R(b));
|
SUBSD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 29: //madd
|
case 29: //madd
|
||||||
MULSD(XMM0, fpr.R(c));
|
MULSD(XMM0, fpr.R(a));
|
||||||
ADDSD(XMM0, fpr.R(b));
|
ADDSD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 30: //nmsub
|
case 30: //nmsub
|
||||||
MULSD(XMM0, fpr.R(c));
|
MULSD(XMM0, fpr.R(a));
|
||||||
SUBSD(XMM0, fpr.R(b));
|
SUBSD(XMM0, fpr.R(b));
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
PXOR(XMM0, M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
case 31: //nmadd
|
case 31: //nmadd
|
||||||
MULSD(XMM0, fpr.R(c));
|
MULSD(XMM0, fpr.R(a));
|
||||||
ADDSD(XMM0, fpr.R(b));
|
ADDSD(XMM0, fpr.R(b));
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
PXOR(XMM0, M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -113,11 +113,29 @@ add a,b,a
|
||||||
*/
|
*/
|
||||||
|
|
||||||
//There's still a little bit more optimization that can be squeezed out of this
|
//There's still a little bit more optimization that can be squeezed out of this
|
||||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
|
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
|
||||||
{
|
{
|
||||||
fpr.Lock(d, a, b);
|
fpr.Lock(d, a, b);
|
||||||
|
|
||||||
if (d == a)
|
if (roundRHS)
|
||||||
|
{
|
||||||
|
if (d == a)
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(d, true);
|
||||||
|
MOVAPD(XMM0, fpr.R(b));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
(this->*op)(fpr.RX(d), R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(d, d == b);
|
||||||
|
if (d != b)
|
||||||
|
MOVAPD(fpr.RX(d), fpr.R(b));
|
||||||
|
Force25BitPrecision(fpr.RX(d), XMM0);
|
||||||
|
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (d == a)
|
||||||
{
|
{
|
||||||
fpr.BindToRegister(d, true);
|
fpr.BindToRegister(d, true);
|
||||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||||
|
@ -166,7 +184,7 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
|
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
|
||||||
break;
|
break;
|
||||||
case 25: // mul
|
case 25: // mul
|
||||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD);
|
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||||
|
@ -230,15 +248,17 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
case 12:
|
case 12:
|
||||||
// Single multiply scalar high
|
// Single multiply scalar high
|
||||||
// TODO - faster version for when regs are different
|
// TODO - faster version for when regs are different
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
MOVDDUP(XMM1, fpr.R(c));
|
MOVDDUP(XMM1, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MOVAPD(XMM0, fpr.R(a));
|
||||||
MULPD(XMM0, R(XMM1));
|
MULPD(XMM0, R(XMM1));
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
MOVAPD(fpr.R(d), XMM0);
|
||||||
break;
|
break;
|
||||||
case 13:
|
case 13:
|
||||||
// TODO - faster version for when regs are different
|
// TODO - faster version for when regs are different
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
MOVAPD(XMM1, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MOVAPD(XMM0, fpr.R(a));
|
||||||
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
||||||
MULPD(XMM0, R(XMM1));
|
MULPD(XMM0, R(XMM1));
|
||||||
MOVAPD(fpr.R(d), XMM0);
|
MOVAPD(fpr.R(d), XMM0);
|
||||||
|
@ -300,35 +320,46 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
fpr.Lock(a,b,c,d);
|
fpr.Lock(a,b,c,d);
|
||||||
|
|
||||||
MOVAPD(XMM0, fpr.R(a));
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 14: //madds0
|
case 14: //madds0
|
||||||
MOVDDUP(XMM1, fpr.R(c));
|
MOVDDUP(XMM1, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MOVAPD(XMM0, fpr.R(a));
|
||||||
MULPD(XMM0, R(XMM1));
|
MULPD(XMM0, R(XMM1));
|
||||||
ADDPD(XMM0, fpr.R(b));
|
ADDPD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 15: //madds1
|
case 15: //madds1
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
MOVAPD(XMM1, fpr.R(c));
|
||||||
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MOVAPD(XMM0, fpr.R(a));
|
||||||
MULPD(XMM0, R(XMM1));
|
MULPD(XMM0, R(XMM1));
|
||||||
ADDPD(XMM0, fpr.R(b));
|
ADDPD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 28: //msub
|
case 28: //msub
|
||||||
MULPD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
MULPD(XMM0, fpr.R(a));
|
||||||
SUBPD(XMM0, fpr.R(b));
|
SUBPD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 29: //madd
|
case 29: //madd
|
||||||
MULPD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
MULPD(XMM0, fpr.R(a));
|
||||||
ADDPD(XMM0, fpr.R(b));
|
ADDPD(XMM0, fpr.R(b));
|
||||||
break;
|
break;
|
||||||
case 30: //nmsub
|
case 30: //nmsub
|
||||||
MULPD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
MULPD(XMM0, fpr.R(a));
|
||||||
SUBPD(XMM0, fpr.R(b));
|
SUBPD(XMM0, fpr.R(b));
|
||||||
PXOR(XMM0, M((void*)&psSignBits));
|
PXOR(XMM0, M((void*)&psSignBits));
|
||||||
break;
|
break;
|
||||||
case 31: //nmadd
|
case 31: //nmadd
|
||||||
MULPD(XMM0, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
|
MULPD(XMM0, fpr.R(a));
|
||||||
ADDPD(XMM0, fpr.R(b));
|
ADDPD(XMM0, fpr.R(b));
|
||||||
PXOR(XMM0, M((void*)&psSignBits));
|
PXOR(XMM0, M((void*)&psSignBits));
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -519,6 +519,25 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const u64 GC_ALIGNED16(psMantissaTruncate[2]) = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
|
||||||
|
static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
|
||||||
|
|
||||||
|
// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
|
||||||
|
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
|
||||||
|
// rounding to nearest as it does.
|
||||||
|
// It needs a temp, so let the caller pass that in.
|
||||||
|
void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
|
||||||
|
{
|
||||||
|
if (jit->jo.accurateSinglePrecision)
|
||||||
|
{
|
||||||
|
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
||||||
|
MOVAPD(tmp, R(xmm));
|
||||||
|
PAND(xmm, M((void*)&psMantissaTruncate));
|
||||||
|
PAND(tmp, M((void*)&psRoundBit));
|
||||||
|
PADDQ(xmm, R(tmp));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static u32 GC_ALIGNED16(temp32);
|
static u32 GC_ALIGNED16(temp32);
|
||||||
static u64 GC_ALIGNED16(temp64);
|
static u64 GC_ALIGNED16(temp64);
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,7 @@ public:
|
||||||
|
|
||||||
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
||||||
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
||||||
|
void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
|
||||||
|
|
||||||
// EAX might get trashed
|
// EAX might get trashed
|
||||||
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
|
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
|
||||||
|
|
Loading…
Reference in New Issue