Merge pull request #2545 from Tilka/accurate_nans
Jit64: optionally accurate NaNs
This commit is contained in:
commit
39155007c7
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
[Core]
|
[Core]
|
||||||
# Values set here will override the main Dolphin settings.
|
# Values set here will override the main Dolphin settings.
|
||||||
|
AccurateNaNs = True
|
||||||
|
|
||||||
[EmuState]
|
[EmuState]
|
||||||
# The Emulation State. 1 is worst, 5 is best, 0 is not set.
|
# The Emulation State. 1 is worst, 5 is best, 0 is not set.
|
||||||
|
|
|
@ -47,7 +47,7 @@ namespace BootManager
|
||||||
// Apply fire liberally
|
// Apply fire liberally
|
||||||
struct ConfigCache
|
struct ConfigCache
|
||||||
{
|
{
|
||||||
bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
|
bool valid, bCPUThread, bSkipIdle, bSyncGPUOnSkipIdleHack, bFPRF, bAccurateNaNs, bMMU, bDCBZOFF, m_EnableJIT, bDSPThread,
|
||||||
bSyncGPU, bFastDiscSpeed, bDSPHLE, bHLE_BS2, bProgressive;
|
bSyncGPU, bFastDiscSpeed, bDSPHLE, bHLE_BS2, bProgressive;
|
||||||
int iCPUCore, Volume;
|
int iCPUCore, Volume;
|
||||||
int iWiimoteSource[MAX_BBMOTES];
|
int iWiimoteSource[MAX_BBMOTES];
|
||||||
|
@ -106,6 +106,7 @@ bool BootCore(const std::string& _rFilename)
|
||||||
config_cache.bSyncGPUOnSkipIdleHack = StartUp.bSyncGPUOnSkipIdleHack;
|
config_cache.bSyncGPUOnSkipIdleHack = StartUp.bSyncGPUOnSkipIdleHack;
|
||||||
config_cache.iCPUCore = StartUp.iCPUCore;
|
config_cache.iCPUCore = StartUp.iCPUCore;
|
||||||
config_cache.bFPRF = StartUp.bFPRF;
|
config_cache.bFPRF = StartUp.bFPRF;
|
||||||
|
config_cache.bAccurateNaNs = StartUp.bAccurateNaNs;
|
||||||
config_cache.bMMU = StartUp.bMMU;
|
config_cache.bMMU = StartUp.bMMU;
|
||||||
config_cache.bDCBZOFF = StartUp.bDCBZOFF;
|
config_cache.bDCBZOFF = StartUp.bDCBZOFF;
|
||||||
config_cache.bSyncGPU = StartUp.bSyncGPU;
|
config_cache.bSyncGPU = StartUp.bSyncGPU;
|
||||||
|
@ -146,6 +147,7 @@ bool BootCore(const std::string& _rFilename)
|
||||||
core_section->Get("SkipIdle", &StartUp.bSkipIdle, StartUp.bSkipIdle);
|
core_section->Get("SkipIdle", &StartUp.bSkipIdle, StartUp.bSkipIdle);
|
||||||
core_section->Get("SyncOnSkipIdle", &StartUp.bSyncGPUOnSkipIdleHack, StartUp.bSyncGPUOnSkipIdleHack);
|
core_section->Get("SyncOnSkipIdle", &StartUp.bSyncGPUOnSkipIdleHack, StartUp.bSyncGPUOnSkipIdleHack);
|
||||||
core_section->Get("FPRF", &StartUp.bFPRF, StartUp.bFPRF);
|
core_section->Get("FPRF", &StartUp.bFPRF, StartUp.bFPRF);
|
||||||
|
core_section->Get("AccurateNaNs", &StartUp.bAccurateNaNs, StartUp.bAccurateNaNs);
|
||||||
core_section->Get("MMU", &StartUp.bMMU, StartUp.bMMU);
|
core_section->Get("MMU", &StartUp.bMMU, StartUp.bMMU);
|
||||||
core_section->Get("DCBZ", &StartUp.bDCBZOFF, StartUp.bDCBZOFF);
|
core_section->Get("DCBZ", &StartUp.bDCBZOFF, StartUp.bDCBZOFF);
|
||||||
core_section->Get("SyncGPU", &StartUp.bSyncGPU, StartUp.bSyncGPU);
|
core_section->Get("SyncGPU", &StartUp.bSyncGPU, StartUp.bSyncGPU);
|
||||||
|
@ -273,6 +275,7 @@ void Stop()
|
||||||
StartUp.bSyncGPUOnSkipIdleHack = config_cache.bSyncGPUOnSkipIdleHack;
|
StartUp.bSyncGPUOnSkipIdleHack = config_cache.bSyncGPUOnSkipIdleHack;
|
||||||
StartUp.iCPUCore = config_cache.iCPUCore;
|
StartUp.iCPUCore = config_cache.iCPUCore;
|
||||||
StartUp.bFPRF = config_cache.bFPRF;
|
StartUp.bFPRF = config_cache.bFPRF;
|
||||||
|
StartUp.bAccurateNaNs = config_cache.bAccurateNaNs;
|
||||||
StartUp.bMMU = config_cache.bMMU;
|
StartUp.bMMU = config_cache.bMMU;
|
||||||
StartUp.bDCBZOFF = config_cache.bDCBZOFF;
|
StartUp.bDCBZOFF = config_cache.bDCBZOFF;
|
||||||
StartUp.bSyncGPU = config_cache.bSyncGPU;
|
StartUp.bSyncGPU = config_cache.bSyncGPU;
|
||||||
|
|
|
@ -33,7 +33,7 @@ SCoreStartupParameter::SCoreStartupParameter()
|
||||||
bJITPairedOff(false), bJITSystemRegistersOff(false),
|
bJITPairedOff(false), bJITSystemRegistersOff(false),
|
||||||
bJITBranchOff(false),
|
bJITBranchOff(false),
|
||||||
bJITILTimeProfiling(false), bJITILOutputIR(false),
|
bJITILTimeProfiling(false), bJITILOutputIR(false),
|
||||||
bFPRF(false),
|
bFPRF(false), bAccurateNaNs(false),
|
||||||
bCPUThread(true), bDSPThread(false), bDSPHLE(true),
|
bCPUThread(true), bDSPThread(false), bDSPHLE(true),
|
||||||
bSkipIdle(true), bSyncGPUOnSkipIdleHack(true), bNTSC(false), bForceNTSCJ(false),
|
bSkipIdle(true), bSyncGPUOnSkipIdleHack(true), bNTSC(false), bForceNTSCJ(false),
|
||||||
bHLE_BS2(true), bEnableCheats(false),
|
bHLE_BS2(true), bEnableCheats(false),
|
||||||
|
@ -78,6 +78,7 @@ void SCoreStartupParameter::LoadDefaults()
|
||||||
bDSPHLE = true;
|
bDSPHLE = true;
|
||||||
bFastmem = true;
|
bFastmem = true;
|
||||||
bFPRF = false;
|
bFPRF = false;
|
||||||
|
bAccurateNaNs = false;
|
||||||
bMMU = false;
|
bMMU = false;
|
||||||
bDCBZOFF = false;
|
bDCBZOFF = false;
|
||||||
iBBDumpPort = -1;
|
iBBDumpPort = -1;
|
||||||
|
|
|
@ -163,6 +163,7 @@ struct SCoreStartupParameter
|
||||||
|
|
||||||
bool bFastmem;
|
bool bFastmem;
|
||||||
bool bFPRF;
|
bool bFPRF;
|
||||||
|
bool bAccurateNaNs;
|
||||||
|
|
||||||
bool bCPUThread;
|
bool bCPUThread;
|
||||||
bool bDSPThread;
|
bool bDSPThread;
|
||||||
|
|
|
@ -135,13 +135,18 @@ public:
|
||||||
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
||||||
void SetFPRFIfNeeded(Gen::X64Reg xmm);
|
void SetFPRFIfNeeded(Gen::X64Reg xmm);
|
||||||
|
|
||||||
|
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in);
|
||||||
|
|
||||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||||
|
|
||||||
typedef u32 (*Operation)(u32 a, u32 b);
|
typedef u32 (*Operation)(u32 a, u32 b);
|
||||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
|
void regimmop(int d, int a, bool binary, u32 value, Operation doop,
|
||||||
bool Rc = false, bool carry = false);
|
void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
|
||||||
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
|
bool Rc = false, bool carry = false);
|
||||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), bool packed = false, bool roundRHS = false);
|
Gen::X64Reg fp_tri_op(int d, int a, int b, bool reversible, bool single,
|
||||||
|
void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
|
||||||
|
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&),
|
||||||
|
bool packed, bool preserve_inputs, bool roundRHS = false);
|
||||||
void FloatCompare(UGeckoInstruction inst, bool upper = false);
|
void FloatCompare(UGeckoInstruction inst, bool upper = false);
|
||||||
|
|
||||||
// OPCODES
|
// OPCODES
|
||||||
|
|
|
@ -138,6 +138,20 @@ public:
|
||||||
LockX(args...);
|
LockX(args...);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clears the `locked` flag of the single entry xregs[x].
// If that entry was not locked, a PanicAlert is raised first; the flag is
// cleared afterwards either way.
template<typename T>
|
||||||
|
void UnlockX(T x)
|
||||||
|
{
|
||||||
|
if (!xregs[x].locked)
|
||||||
|
PanicAlert("RegCache: x %i already unlocked!", x);
|
||||||
|
xregs[x].locked = false;
|
||||||
|
}
|
||||||
|
// Variadic form: unlocks `first`, then calls UnlockX again with the remaining
// arguments, so every argument passed in ends up unlocked in order.
template<typename T, typename... Args>
|
||||||
|
void UnlockX(T first, Args... args)
|
||||||
|
{
|
||||||
|
UnlockX(first);
|
||||||
|
UnlockX(args...);
|
||||||
|
}
|
||||||
|
|
||||||
void UnlockAll();
|
void UnlockAll();
|
||||||
void UnlockAllX();
|
void UnlockAllX();
|
||||||
|
|
||||||
|
|
|
@ -10,38 +10,37 @@
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||||
|
static const u64 GC_ALIGNED16(psGeneratedQNaN[2]) = {0x7FF8000000000000ULL, 0x7FF8000000000000ULL};
|
||||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
X64Reg Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
||||||
void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool roundRHS)
|
void (XEmitter::*sseOp)(X64Reg, const OpArg&), bool packed, bool preserve_inputs, bool roundRHS)
|
||||||
{
|
{
|
||||||
fpr.Lock(d, a, b);
|
fpr.Lock(d, a, b);
|
||||||
fpr.BindToRegister(d, d == a || d == b || !single);
|
fpr.BindToRegister(d, d == a || d == b || !single);
|
||||||
|
X64Reg dest = preserve_inputs ? XMM1 : fpr.RX(d);
|
||||||
if (roundRHS)
|
if (roundRHS)
|
||||||
{
|
{
|
||||||
if (d == a)
|
if (d == a && !preserve_inputs)
|
||||||
{
|
{
|
||||||
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
|
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
|
||||||
(this->*sseOp)(fpr.RX(d), R(XMM0));
|
(this->*sseOp)(fpr.RX(d), R(XMM0));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
|
Force25BitPrecision(dest, fpr.R(b), XMM0);
|
||||||
(this->*sseOp)(fpr.RX(d), fpr.R(a));
|
(this->*sseOp)(dest, fpr.R(a));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
|
avx_op(avxOp, sseOp, dest, fpr.R(a), fpr.R(b), packed, reversible);
|
||||||
}
|
}
|
||||||
if (single)
|
return dest;
|
||||||
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
|
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
|
// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
|
||||||
|
@ -56,6 +55,112 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm)
|
||||||
SetFPRF(xmm);
|
SetFPRF(xmm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits code that patches a NaN result so that it follows the PowerPC rules
// from the table below rather than the x86 ones.
//   inst    - instruction being compiled; FA/FB/FC name the candidate input
//             registers and OPCD selects scalar vs. paired-single handling.
//   xmm_out - register that receives the final value.
//   xmm     - register holding the freshly computed result; it is patched in
//             place and must not be XMM0 when accurate NaNs are enabled.
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
|
||||||
|
{
|
||||||
|
// | PowerPC | x86
|
||||||
|
// ---------------------+----------+---------
|
||||||
|
// input NaN precedence | 1*3 + 2 | 1*2 + 3
|
||||||
|
// generated QNaN | positive | negative
|
||||||
|
//
|
||||||
|
// Dragon Ball: Revenge of King Piccolo requires generated NaNs
|
||||||
|
// to be positive, so we'll have to handle them manually.
|
||||||
|
|
||||||
|
// Accurate NaNs disabled: only move the result to its destination if needed.
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs)
|
||||||
|
{
|
||||||
|
if (xmm_out != xmm)
|
||||||
|
MOVAPD(xmm_out, R(xmm));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// XMM0 is used as the scratch/mask register by the paired-single paths below.
_assert_(xmm != XMM0);
|
||||||
|
|
||||||
|
// Collect the distinct registers among FA, FB, FC whose js.op->fregsIn entry
// is set, keeping a, b, c order and dropping duplicates.
std::vector<u32> inputs;
|
||||||
|
u32 a = inst.FA, b = inst.FB, c = inst.FC;
|
||||||
|
for (u32 i : {a, b, c})
|
||||||
|
{
|
||||||
|
if (!js.op->fregsIn[i])
|
||||||
|
continue;
|
||||||
|
if (std::find(inputs.begin(), inputs.end(), i) == inputs.end())
|
||||||
|
inputs.push_back(i);
|
||||||
|
}
|
||||||
|
if (inst.OPCD != 4)
|
||||||
|
{
|
||||||
|
// not paired-single
|
||||||
|
// Comparing the result with itself reports "unordered" (parity flag set)
// only when it is a NaN; that case branches to the fixup code.
UCOMISD(xmm, R(xmm));
|
||||||
|
// NOTE(review): the `true` argument presumably forces a long branch encoding
// because the target is emitted in far code - confirm against the emitter.
FixupBranch handle_nan = J_CC(CC_P, true);
|
||||||
|
// The fixup is emitted between SwitchToFarCode() and SwitchToNearCode().
SwitchToFarCode();
|
||||||
|
SetJumpTarget(handle_nan);
|
||||||
|
std::vector<FixupBranch> fixups;
|
||||||
|
// Load each input in a, b, c order and stop at the first one that is itself
// a NaN; that input then stays in xmm as the result.
for (u32 x : inputs)
|
||||||
|
{
|
||||||
|
MOVDDUP(xmm, fpr.R(x));
|
||||||
|
UCOMISD(xmm, R(xmm));
|
||||||
|
fixups.push_back(J_CC(CC_P));
|
||||||
|
}
|
||||||
|
// No input was a NaN, so the operation generated one: use the positive
// QNaN constant instead.
MOVDDUP(xmm, M(psGeneratedQNaN));
|
||||||
|
// Every "input is a NaN" branch lands here, skipping the constant load above.
for (FixupBranch fixup : fixups)
|
||||||
|
SetJumpTarget(fixup);
|
||||||
|
FixupBranch done = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
SetJumpTarget(done);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// paired-single
|
||||||
|
// The blends below are applied one after another, so the input handled last
// wins; reversing the list gives FA the final say, then FB, then FC.
std::reverse(inputs.begin(), inputs.end());
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
// XMM0 = per-lane mask of the NaN lanes in the result.
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R(xmm), R(xmm), CMP_UNORD);
|
||||||
|
// Branch to the fixup code only if at least one mask bit is set.
PTEST(XMM0, R(XMM0));
|
||||||
|
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(handle_nan);
|
||||||
|
// Put the generated QNaN into the NaN lanes of the result first.
// NOTE(review): no mask operand is passed; the SSE4.1 BLENDVPD encoding
// takes it implicitly from XMM0 - confirm against the emitter.
BLENDVPD(xmm, M(psGeneratedQNaN));
|
||||||
|
for (u32 x : inputs)
|
||||||
|
{
|
||||||
|
// XMM0 = mask of the lanes where this input is a NaN; those lanes of the
// result are overwritten with the input's own value.
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R(x), fpr.R(x), CMP_UNORD);
|
||||||
|
BLENDVPD(xmm, fpr.R(x));
|
||||||
|
}
|
||||||
|
FixupBranch done = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
SetJumpTarget(done);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// SSE2 fallback
|
||||||
|
// NOTE(review): presumably reserves a spare host XMM register as a second
// scratch; it is released by fpr.UnlockX(tmp) at the end of this branch.
X64Reg tmp = fpr.GetFreeXReg();
|
||||||
|
fpr.FlushLockX(tmp);
|
||||||
|
// XMM0 = per-lane mask of the NaN lanes in the result; the mask's sign bits
// are moved to RSCRATCH and the fixup runs only if any of them is set.
MOVAPD(XMM0, R(xmm));
|
||||||
|
CMPPD(XMM0, R(XMM0), CMP_UNORD);
|
||||||
|
MOVMSKPD(RSCRATCH, R(XMM0));
|
||||||
|
TEST(32, R(RSCRATCH), R(RSCRATCH));
|
||||||
|
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(handle_nan);
|
||||||
|
// Manual blend: xmm = (mask & generated QNaN) | (~mask & xmm), built in tmp.
MOVAPD(tmp, R(XMM0));
|
||||||
|
PANDN(XMM0, R(xmm));
|
||||||
|
PAND(tmp, M(psGeneratedQNaN));
|
||||||
|
POR(tmp, R(XMM0));
|
||||||
|
MOVAPD(xmm, R(tmp));
|
||||||
|
for (u32 x : inputs)
|
||||||
|
{
|
||||||
|
// XMM0 = mask of the lanes where this input is NOT a NaN (CMP_ORD).
// The result is kept in those lanes and replaced by the input's value in
// the remaining (NaN) lanes.
MOVAPD(XMM0, fpr.R(x));
|
||||||
|
CMPPD(XMM0, R(XMM0), CMP_ORD);
|
||||||
|
MOVAPD(tmp, R(XMM0));
|
||||||
|
PANDN(XMM0, fpr.R(x));
|
||||||
|
PAND(xmm, R(tmp));
|
||||||
|
POR(xmm, R(XMM0));
|
||||||
|
}
|
||||||
|
FixupBranch done = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
SetJumpTarget(done);
|
||||||
|
fpr.UnlockX(tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Move the (possibly patched) result into its destination register.
if (xmm_out != xmm)
|
||||||
|
MOVAPD(xmm_out, R(xmm));
|
||||||
|
}
|
||||||
|
|
||||||
void Jit64::fp_arith(UGeckoInstruction inst)
|
void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
@ -80,20 +185,27 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
packed = false;
|
packed = false;
|
||||||
|
|
||||||
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
|
||||||
|
bool preserve_inputs = SConfig::GetInstance().m_LocalCoreStartupParameter.bAccurateNaNs;
|
||||||
|
|
||||||
|
X64Reg dest = INVALID_REG;
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
|
case 18: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
|
||||||
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed); break;
|
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, packed, preserve_inputs); break;
|
||||||
case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
|
case 20: dest = fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
|
||||||
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed); break;
|
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, packed, preserve_inputs); break;
|
||||||
case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
|
case 21: dest = fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
|
||||||
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed); break;
|
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, packed, preserve_inputs); break;
|
||||||
case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
|
case 25: dest = fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
|
||||||
packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, round_input); break;
|
packed ? &XEmitter::MULPD : &XEmitter::MULSD, packed, preserve_inputs, round_input); break;
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||||
}
|
}
|
||||||
|
HandleNaNs(inst, fpr.RX(d), dest);
|
||||||
|
if (single)
|
||||||
|
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
|
||||||
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fmaddXX(UGeckoInstruction inst)
|
void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
|
@ -220,13 +332,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
if (inst.SUBOP5 == 31) //nmadd
|
if (inst.SUBOP5 == 31) //nmadd
|
||||||
PXOR(XMM1, M(packed ? psSignBits2 : psSignBits));
|
PXOR(XMM1, M(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
|
|
||||||
fpr.BindToRegister(d, !single);
|
fpr.BindToRegister(d, !single);
|
||||||
|
|
||||||
if (single)
|
if (single)
|
||||||
ForceSinglePrecision(fpr.RX(d), R(XMM1), packed, true);
|
{
|
||||||
|
HandleNaNs(inst, fpr.RX(d), XMM1);
|
||||||
|
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
HandleNaNs(inst, XMM1, XMM1);
|
||||||
MOVSD(fpr.RX(d), R(XMM1));
|
MOVSD(fpr.RX(d), R(XMM1));
|
||||||
|
}
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
@ -379,7 +495,6 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
|
|
||||||
UCOMISD(fpr.RX(b), fpr.R(a));
|
UCOMISD(fpr.RX(b), fpr.R(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
OpArg op_a = fpr.R(a);
|
OpArg op_a = fpr.R(a);
|
||||||
fpr.BindToRegister(d, d == b || d == c);
|
fpr.BindToRegister(d, d == b || d == c);
|
||||||
X64Reg tmp = XMM0;
|
X64Reg tmp = XMM1;
|
||||||
MOVDDUP(tmp, op_a); // {a.ps0, a.ps0}
|
MOVDDUP(tmp, op_a); // {a.ps0, a.ps0}
|
||||||
ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
ADDPD(tmp, fpr.R(b)); // {a.ps0 + b.ps0, a.ps0 + b.ps1}
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
|
@ -55,9 +55,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MOVAPD(XMM1, fpr.R(c));
|
MOVAPD(XMM0, fpr.R(c));
|
||||||
SHUFPD(XMM1, R(tmp), 2);
|
SHUFPD(XMM0, R(tmp), 2);
|
||||||
tmp = XMM1;
|
tmp = XMM0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -68,7 +68,8 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_sum WTF!!!");
|
PanicAlert("ps_sum WTF!!!");
|
||||||
}
|
}
|
||||||
ForceSinglePrecision(fpr.RX(d), R(tmp));
|
HandleNaNs(inst, fpr.RX(d), tmp);
|
||||||
|
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
@ -88,19 +89,20 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 12: // ps_muls0
|
case 12: // ps_muls0
|
||||||
MOVDDUP(XMM0, fpr.R(c));
|
MOVDDUP(XMM1, fpr.R(c));
|
||||||
break;
|
break;
|
||||||
case 13: // ps_muls1
|
case 13: // ps_muls1
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_muls WTF!!!");
|
PanicAlert("ps_muls WTF!!!");
|
||||||
}
|
}
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM1, fpr.R(a));
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
ForceSinglePrecision(fpr.RX(d), R(XMM0));
|
HandleNaNs(inst, fpr.RX(d), XMM1);
|
||||||
|
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue