Merge pull request #12092 from JosJuice/jitarm64-last-nan
JitArm64: Skip checking last input for NaN for non-SIMD operations
This commit is contained in:
commit
934418a289
|
@ -80,9 +80,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
const bool fma = use_b && use_c;
|
const bool fma = use_b && use_c;
|
||||||
const bool negate_result = (op5 & ~0x1) == 30;
|
const bool negate_result = (op5 & ~0x1) == 30;
|
||||||
|
|
||||||
// Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
|
|
||||||
const bool can_generate_nan = (op5 & ~0x1) != 20;
|
|
||||||
|
|
||||||
const bool output_is_single = inst.OPCD == 59;
|
const bool output_is_single = inst.OPCD == 59;
|
||||||
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
|
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
|
||||||
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
|
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
|
||||||
|
@ -203,45 +200,35 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
if (use_c && VA != VC && (!use_b || VB != VC))
|
if (use_c && VA != VC && (!use_b || VB != VC))
|
||||||
inputs.push_back(VC);
|
inputs.push_back(VC);
|
||||||
|
|
||||||
// If any inputs are NaNs, pick the first NaN of them and set its quiet bit
|
// If any inputs are NaNs, pick the first NaN of them and set its quiet bit.
|
||||||
for (size_t i = 0; i < inputs.size(); ++i)
|
// However, we can skip checking the last input, because if exactly one input is NaN, AArch64
|
||||||
|
// arithmetic instructions automatically pick that NaN and make it quiet, just like we want.
|
||||||
|
for (size_t i = 0; i < inputs.size() - 1; ++i)
|
||||||
{
|
{
|
||||||
// Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
|
|
||||||
// least one NaN input
|
|
||||||
const bool check_input = can_generate_nan || i != inputs.size() - 1;
|
|
||||||
|
|
||||||
const ARM64Reg input = inputs[i];
|
const ARM64Reg input = inputs[i];
|
||||||
FixupBranch skip;
|
|
||||||
if (check_input)
|
m_float_emit.FCMP(input);
|
||||||
{
|
FixupBranch skip = B(CCFlags::CC_VC);
|
||||||
m_float_emit.FCMP(input);
|
|
||||||
skip = B(CCFlags::CC_VC);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make the NaN quiet
|
// Make the NaN quiet
|
||||||
m_float_emit.FADD(VD, input, input);
|
m_float_emit.FADD(VD, input, input);
|
||||||
|
|
||||||
nan_fixups.push_back(B());
|
nan_fixups.push_back(B());
|
||||||
|
|
||||||
if (check_input)
|
SetJumpTarget(skip);
|
||||||
SetJumpTarget(skip);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::optional<FixupBranch> nan_early_fixup;
|
std::optional<FixupBranch> nan_early_fixup;
|
||||||
if (can_generate_nan)
|
if (negate_result)
|
||||||
{
|
{
|
||||||
// There was no NaN in any of the inputs, so the NaN must have been generated by the
|
// If we have a NaN, we must not execute FNEG.
|
||||||
// arithmetic instruction. In this case, the result is already correct.
|
if (result_reg != VD)
|
||||||
if (negate_result)
|
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
|
||||||
{
|
nan_fixups.push_back(B());
|
||||||
if (result_reg != VD)
|
}
|
||||||
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
|
else
|
||||||
nan_fixups.push_back(B());
|
{
|
||||||
}
|
nan_early_fixup = B();
|
||||||
else
|
|
||||||
{
|
|
||||||
nan_early_fixup = B();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SwitchToNearCode();
|
SwitchToNearCode();
|
||||||
|
|
|
@ -380,49 +380,21 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
|
||||||
const ARM64Reg VC = fpr.R(c, type);
|
const ARM64Reg VC = fpr.R(c, type);
|
||||||
const ARM64Reg VD = fpr.RW(d, type);
|
const ARM64Reg VD = fpr.RW(d, type);
|
||||||
const ARM64Reg V0 = fpr.GetReg();
|
const ARM64Reg V0 = fpr.GetReg();
|
||||||
const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
|
|
||||||
|
|
||||||
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
|
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
|
||||||
|
|
||||||
FixupBranch a_nan_done, b_nan_done;
|
|
||||||
if (m_accurate_nans)
|
if (m_accurate_nans)
|
||||||
{
|
{
|
||||||
const auto check_nan = [&](ARM64Reg input) {
|
// If the first input is NaN, set the temp register for the second input to 0. This is because:
|
||||||
m_float_emit.FCMP(scalar_reg_encoder(input));
|
//
|
||||||
FixupBranch not_nan = B(CCFlags::CC_VC);
|
// - If the second input is also NaN, setting it to 0 ensures that the first NaN will be picked.
|
||||||
FixupBranch nan = B();
|
// - If only the first input is NaN, setting the second input to 0 has no effect on the result.
|
||||||
SetJumpTarget(not_nan);
|
//
|
||||||
|
// Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
|
||||||
SwitchToFarCode();
|
m_float_emit.FCMP(scalar_reg_encoder(VA));
|
||||||
SetJumpTarget(nan);
|
FixupBranch a_not_nan = B(CCFlags::CC_VC);
|
||||||
|
m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
|
||||||
if (upper)
|
SetJumpTarget(a_not_nan);
|
||||||
{
|
|
||||||
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
|
|
||||||
scalar_reg_encoder(input));
|
|
||||||
m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
|
|
||||||
}
|
|
||||||
else if (d != c)
|
|
||||||
{
|
|
||||||
m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
|
|
||||||
scalar_reg_encoder(input));
|
|
||||||
m_float_emit.INS(size, VD, 1, VC, 1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
|
|
||||||
scalar_reg_encoder(input));
|
|
||||||
m_float_emit.INS(size, VD, 0, V0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
FixupBranch nan_done = B();
|
|
||||||
SwitchToNearCode();
|
|
||||||
|
|
||||||
return nan_done;
|
|
||||||
};
|
|
||||||
|
|
||||||
a_nan_done = check_nan(VA);
|
|
||||||
b_nan_done = check_nan(V0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (upper)
|
if (upper)
|
||||||
|
@ -441,15 +413,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
|
||||||
m_float_emit.INS(size, VD, 0, V0, 0);
|
m_float_emit.INS(size, VD, 0, V0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_accurate_nans)
|
|
||||||
{
|
|
||||||
SetJumpTarget(a_nan_done);
|
|
||||||
SetJumpTarget(b_nan_done);
|
|
||||||
}
|
|
||||||
|
|
||||||
fpr.Unlock(V0);
|
fpr.Unlock(V0);
|
||||||
if (temp_gpr != ARM64Reg::INVALID_REG)
|
|
||||||
gpr.Unlock(temp_gpr);
|
|
||||||
|
|
||||||
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
|
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
|
||||||
"Register allocation turned singles into doubles in the middle of ps_sumX");
|
"Register allocation turned singles into doubles in the middle of ps_sumX");
|
||||||
|
|
Loading…
Reference in New Issue