Merge pull request #2652 from Tilka/ps_sum1
Jit64: fix pre-SSE4.1 fallback of ps_sum1 with AccurateNaNs=True
This commit is contained in:
commit
519a7c4a29
|
@@ -135,7 +135,8 @@ public:
|
||||||
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
||||||
void SetFPRFIfNeeded(Gen::X64Reg xmm);
|
void SetFPRFIfNeeded(Gen::X64Reg xmm);
|
||||||
|
|
||||||
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in);
|
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
|
||||||
|
Gen::X64Reg clobber = Gen::XMM0);
|
||||||
|
|
||||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||||
|
|
||||||
|
|
|
@@ -55,7 +55,7 @@ void Jit64::SetFPRFIfNeeded(X64Reg xmm)
|
||||||
SetFPRF(xmm);
|
SetFPRF(xmm);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
|
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Reg clobber)
|
||||||
{
|
{
|
||||||
// | PowerPC | x86
|
// | PowerPC | x86
|
||||||
// ---------------------+----------+---------
|
// ---------------------+----------+---------
|
||||||
|
@@ -72,7 +72,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
_assert_(xmm != XMM0);
|
_assert_(xmm != clobber);
|
||||||
|
|
||||||
std::vector<u32> inputs;
|
std::vector<u32> inputs;
|
||||||
u32 a = inst.FA, b = inst.FB, c = inst.FC;
|
u32 a = inst.FA, b = inst.FB, c = inst.FC;
|
||||||
|
@@ -110,15 +110,16 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
|
||||||
std::reverse(inputs.begin(), inputs.end());
|
std::reverse(inputs.begin(), inputs.end());
|
||||||
if (cpu_info.bSSE4_1)
|
if (cpu_info.bSSE4_1)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, R(xmm), R(xmm), CMP_UNORD);
|
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
|
||||||
PTEST(XMM0, R(XMM0));
|
PTEST(clobber, R(clobber));
|
||||||
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
||||||
SwitchToFarCode();
|
SwitchToFarCode();
|
||||||
SetJumpTarget(handle_nan);
|
SetJumpTarget(handle_nan);
|
||||||
|
_assert_msg_(DYNA_REC, clobber == XMM0, "BLENDVPD implicitly uses XMM0");
|
||||||
BLENDVPD(xmm, M(psGeneratedQNaN));
|
BLENDVPD(xmm, M(psGeneratedQNaN));
|
||||||
for (u32 x : inputs)
|
for (u32 x : inputs)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, XMM0, fpr.R(x), fpr.R(x), CMP_UNORD);
|
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, fpr.R(x), fpr.R(x), CMP_UNORD);
|
||||||
BLENDVPD(xmm, fpr.R(x));
|
BLENDVPD(xmm, fpr.R(x));
|
||||||
}
|
}
|
||||||
FixupBranch done = J(true);
|
FixupBranch done = J(true);
|
||||||
|
@@ -130,26 +131,26 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm)
|
||||||
// SSE2 fallback
|
// SSE2 fallback
|
||||||
X64Reg tmp = fpr.GetFreeXReg();
|
X64Reg tmp = fpr.GetFreeXReg();
|
||||||
fpr.FlushLockX(tmp);
|
fpr.FlushLockX(tmp);
|
||||||
MOVAPD(XMM0, R(xmm));
|
MOVAPD(clobber, R(xmm));
|
||||||
CMPPD(XMM0, R(XMM0), CMP_UNORD);
|
CMPPD(clobber, R(clobber), CMP_UNORD);
|
||||||
MOVMSKPD(RSCRATCH, R(XMM0));
|
MOVMSKPD(RSCRATCH, R(clobber));
|
||||||
TEST(32, R(RSCRATCH), R(RSCRATCH));
|
TEST(32, R(RSCRATCH), R(RSCRATCH));
|
||||||
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
FixupBranch handle_nan = J_CC(CC_NZ, true);
|
||||||
SwitchToFarCode();
|
SwitchToFarCode();
|
||||||
SetJumpTarget(handle_nan);
|
SetJumpTarget(handle_nan);
|
||||||
MOVAPD(tmp, R(XMM0));
|
MOVAPD(tmp, R(clobber));
|
||||||
PANDN(XMM0, R(xmm));
|
PANDN(clobber, R(xmm));
|
||||||
PAND(tmp, M(psGeneratedQNaN));
|
PAND(tmp, M(psGeneratedQNaN));
|
||||||
POR(tmp, R(XMM0));
|
POR(tmp, R(clobber));
|
||||||
MOVAPD(xmm, R(tmp));
|
MOVAPD(xmm, R(tmp));
|
||||||
for (u32 x : inputs)
|
for (u32 x : inputs)
|
||||||
{
|
{
|
||||||
MOVAPD(XMM0, fpr.R(x));
|
MOVAPD(clobber, fpr.R(x));
|
||||||
CMPPD(XMM0, R(XMM0), CMP_ORD);
|
CMPPD(clobber, R(clobber), CMP_ORD);
|
||||||
MOVAPD(tmp, R(XMM0));
|
MOVAPD(tmp, R(clobber));
|
||||||
PANDN(XMM0, fpr.R(x));
|
PANDN(clobber, fpr.R(x));
|
||||||
PAND(xmm, R(tmp));
|
PAND(xmm, R(tmp));
|
||||||
POR(xmm, R(XMM0));
|
POR(xmm, R(clobber));
|
||||||
}
|
}
|
||||||
FixupBranch done = J(true);
|
FixupBranch done = J(true);
|
||||||
SwitchToNearCode();
|
SwitchToNearCode();
|
||||||
|
|
|
@@ -68,7 +68,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||||
default:
|
default:
|
||||||
PanicAlert("ps_sum WTF!!!");
|
PanicAlert("ps_sum WTF!!!");
|
||||||
}
|
}
|
||||||
HandleNaNs(inst, fpr.RX(d), tmp);
|
HandleNaNs(inst, fpr.RX(d), tmp, tmp == XMM1 ? XMM0 : XMM1);
|
||||||
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
|
ForceSinglePrecision(fpr.RX(d), fpr.R(d));
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
|
|
Loading…
Reference in New Issue