JitArm64: Implement accurate NaNs

For quite some time now, we've had a setting on x86-64 that makes Dolphin
handle NaNs in a more accurate but slower way. There's only one game that
cares about this, Dragon Ball: Revenge of King Piccolo, and what that game
cares about more specifically is that the default NaN (or "generated NaN"
as I believe it's called in PowerPC documentation) is the same as on
PowerPC. On ARM, the default NaN is the same as on PowerPC, so for the
longest time we didn't need to do anything special to get Dragon Ball:
Revenge of King Piccolo working. However, in 93e636a I changed how we
handle FMA instructions in a way that resulted in the sign of NaNs
becoming inverted for nmadd/nmsub instructions, breaking the game.
To fix this, let's implement the AccurateNaNs setting, like on x86-64.
This commit is contained in:
JosJuice 2022-12-03 17:37:51 +01:00
parent 5c41d3b602
commit 06e60ac327
5 changed files with 329 additions and 15 deletions

View File

@ -2173,6 +2173,12 @@ void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Re
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}
void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}
void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!");
@ -2985,6 +2991,28 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
EmitScalar2RegMisc(1, IsDouble(Rd) ? 3 : 2, 0x1D, Rd, Rn);
}
// Scalar - pairwise
// Emits the scalar pairwise FADDP instruction with destination Rd and source Rn.
void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn)
{
  // The size field is 1 when Rd is a double register and 0 otherwise
  const u32 size = IsDouble(Rd) ? 1 : 0;
  constexpr u32 opcode = 0b01101;

  EmitScalarPairwise(true, size, opcode, Rd, Rn);
}
// Emits the scalar pairwise FMAXP instruction with destination Rd and source Rn.
void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn)
{
  // The size field is 1 when Rd is a double register and 0 otherwise
  const u32 size = IsDouble(Rd) ? 1 : 0;
  constexpr u32 opcode = 0b01111;

  EmitScalarPairwise(true, size, opcode, Rd, Rn);
}
// Emits the scalar pairwise FMINP instruction with destination Rd and source Rn.
void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Shares its opcode with FMAXP; the two are told apart by the size field, which is
  // 3 when Rd is a double register and 2 otherwise
  const u32 size = IsDouble(Rd) ? 3 : 2;
  constexpr u32 opcode = 0b01111;

  EmitScalarPairwise(true, size, opcode, Rd, Rn);
}
// Emits the scalar pairwise FMAXNMP instruction with destination Rd and source Rn.
void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn)
{
  // The size field is 1 when Rd is a double register and 0 otherwise
  const u32 size = IsDouble(Rd) ? 1 : 0;
  constexpr u32 opcode = 0b01100;

  EmitScalarPairwise(true, size, opcode, Rd, Rn);
}
// Emits the scalar pairwise FMINNMP instruction with destination Rd and source Rn.
void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn)
{
  // Shares its opcode with FMAXNMP; the two are told apart by the size field, which is
  // 3 when Rd is a double register and 2 otherwise
  const u32 size = IsDouble(Rd) ? 3 : 2;
  constexpr u32 opcode = 0b01100;

  EmitScalarPairwise(true, size, opcode, Rd, Rn);
}
// Scalar - 2 Source
void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{

View File

@ -1130,6 +1130,13 @@ public:
void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);
// Scalar - pairwise
// Rd is the destination register and Rn is the source register. Whether Rd is a double
// register selects the size field of the emitted instruction.
void FADDP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
void FMINP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
// Scalar - 2 Source
void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -1296,6 +1303,7 @@ private:
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
ARM64Reg Rn);

View File

@ -177,6 +177,10 @@ public:
void FloatCompare(UGeckoInstruction inst, bool upper = false);
// Emits code that loads the quiet NaN bit constant into dest_reg, using the 32-bit
// constant if single is true and the 64-bit constant otherwise.
// temp_gpr can be INVALID_REG if single is true
void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
Arm64Gen::ARM64Reg temp_gpr);
bool IsFPRStoreSafe(size_t guest_reg) const;
protected:

View File

@ -3,6 +3,8 @@
#include "Core/PowerPC/JitArm64/Jit.h"
#include <optional>
#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
@ -66,14 +68,20 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
FALLBACK_IF(inst.Rc);
FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18));
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
u32 op5 = inst.SUBOP5;
const u32 a = inst.FA;
const u32 b = inst.FB;
const u32 c = inst.FC;
const u32 d = inst.FD;
const u32 op5 = inst.SUBOP5;
const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
const bool use_b = op5 != 25; // fmul uses no B
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
// Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
const bool can_generate_nan = (op5 & ~0x1) != 20;
const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@ -84,13 +92,12 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
};
const bool inputs_are_singles = inputs_are_singles_func();
const RegType type =
(inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
const bool single = inputs_are_singles && output_is_single;
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out =
output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder =
(inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
@ -98,6 +105,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
ARM64Reg rounded_c_reg = VC;
if (round_c)
@ -118,6 +126,21 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
ARM64Reg result_reg = VD;
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
{
V1Q = fpr.GetReg();
result_reg = reg_encoder(V1Q);
}
const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;
if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg();
}
switch (op5)
{
@ -166,6 +189,74 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
break;
}
std::vector<FixupBranch> nan_fixups;
if (m_accurate_nans)
{
// Check if we need to handle NaNs
m_float_emit.FCMP(result_reg);
FixupBranch no_nan = B(CCFlags::CC_VC);
FixupBranch nan = B();
SetJumpTarget(no_nan);
SwitchToFarCode();
SetJumpTarget(nan);
const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);
EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);
std::vector<ARM64Reg> inputs;
inputs.push_back(VA);
if (use_b && VA != VB)
inputs.push_back(VB);
if (use_c && VA != VC && (!use_b || VB != VC))
inputs.push_back(VC);
// If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
for (size_t i = 0; i < inputs.size(); ++i)
{
// Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
// least one NaN input
const bool check_input = can_generate_nan || i != inputs.size() - 1;
const ARM64Reg input = inputs[i];
FixupBranch skip;
if (check_input)
{
m_float_emit.FCMP(input);
skip = B(CCFlags::CC_VC);
}
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
EncodeRegToDouble(quiet_bit_reg));
nan_fixups.push_back(B());
if (check_input)
SetJumpTarget(skip);
}
std::optional<FixupBranch> nan_early_fixup;
if (can_generate_nan)
{
// There was no NaN in any of the inputs, so the NaN must have been generated by the
// arithmetic instruction. In this case, the result is already correct.
if (negate_result)
{
if (result_reg != VD)
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
nan_fixups.push_back(B());
}
else
{
nan_early_fixup = B();
}
}
SwitchToNearCode();
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
}
// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
@ -174,8 +265,15 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
else if (result_reg != VD)
m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
for (FixupBranch fixup : nan_fixups)
SetJumpTarget(fixup);
if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
if (V1Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V1Q);
if (temp_gpr != ARM64Reg::INVALID_REG)
gpr.Unlock(temp_gpr);
if (output_is_single)
{
@ -787,6 +885,29 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
}
}
// Emits code that fills dest_reg with a constant in which only the quiet NaN bit is set.
//
// dest_reg: vector register receiving the constant.
// single:   true to produce the 32-bit constant, false to produce the 64-bit constant.
// temp_gpr: scratch general purpose register, only used (and required to be valid) when
//           single is false.
void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
{
  // The constant is QNaN & ~SNaN, i.e. just the quiet bit.
  //
  // (Using the full QNaN pattern would work equally well, but emitting it would cost two
  // instructions even in the single case)

  if (!single)
  {
    // The 64-bit constant is materialized in a GPR and then transferred to the
    // vector register
    ASSERT(temp_gpr != ARM64Reg::INVALID_REG);
    const ARM64Reg temp_gpr_64 = EncodeRegTo64(temp_gpr);
    MOVI2R(temp_gpr_64, 0x0008'0000'0000'0000);

    if (IsQuad(dest_reg))
      m_float_emit.DUP(64, dest_reg, temp_gpr_64);
    else
      m_float_emit.FMOV(dest_reg, temp_gpr_64);

    return;
  }

  // The 32-bit constant is emitted with a single MOVI
  m_float_emit.MOVI(32, dest_reg, 0x40, 16);
}
bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
{
return js.fpr_is_store_safe[guest_reg];

View File

@ -83,8 +83,11 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
const u32 d = inst.FD;
const u32 op5 = inst.SUBOP5;
const bool muls = (op5 & ~0x1) == 12;
const bool madds = (op5 & ~0x1) == 14;
const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX
const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B
const bool use_b = op5 != 25 && !muls; // mul and muls don't use B
const bool duplicated_c = muls || madds;
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool msub = op5 == 28 || op5 == 30;
@ -107,6 +110,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
ARM64Reg V2Q = ARM64Reg::INVALID_REG;
ARM64Reg V3Q = ARM64Reg::INVALID_REG;
ARM64Reg rounded_c_reg = VC;
if (round_c)
@ -127,12 +132,29 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
ARM64Reg result_reg = VD;
if (fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg))
const bool need_accurate_fma_reg =
fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_accurate_fma_reg || preserve_d)
{
V1Q = fpr.GetReg();
result_reg = reg_encoder(V1Q);
}
const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg();
V2Q = fpr.GetReg();
if (duplicated_c || VD == result_reg)
V3Q = fpr.GetReg();
}
switch (op5)
{
case 12: // ps_muls0: d = a * c.ps0
@ -211,6 +233,69 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
}
FixupBranch nan_fixup;
if (m_accurate_nans)
{
const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
const ARM64Reg zero_reg = reg_encoder(V2Q);
// Check if we need to handle NaNs
m_float_emit.FMAXP(nan_temp_reg, result_reg);
m_float_emit.FCMP(nan_temp_reg);
FixupBranch no_nan = B(CCFlags::CC_VC);
FixupBranch nan = B();
SetJumpTarget(no_nan);
SwitchToFarCode();
SetJumpTarget(nan);
// Pick the right NaNs
m_float_emit.MOVI(8, zero_reg, 0);
const auto check_input = [&](ARM64Reg input) {
m_float_emit.FACGE(size, nan_temp_reg_paired, input, zero_reg);
m_float_emit.BIF(result_reg, input, nan_temp_reg_paired);
};
ARM64Reg c_reg_for_nan_purposes = VC;
if (duplicated_c)
{
c_reg_for_nan_purposes = reg_encoder(V3Q);
m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1);
}
if (use_c)
check_input(c_reg_for_nan_purposes);
if (use_b && (!use_c || VB != c_reg_for_nan_purposes))
check_input(VB);
if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes))
check_input(VA);
// Make the NaNs quiet
const ARM64Reg quiet_bit_reg = VD == result_reg ? reg_encoder(V3Q) : VD;
EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr);
m_float_emit.FACGE(size, nan_temp_reg_paired, result_reg, zero_reg);
m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg);
if (negate_result)
m_float_emit.FNEG(size, result_reg, result_reg);
if (VD == result_reg)
m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired);
else // quiet_bit_reg == VD
m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
nan_fixup = B();
SwitchToNearCode();
}
// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
if (negate_result)
@ -218,10 +303,19 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
else if (result_reg != VD)
m_float_emit.MOV(VD, result_reg);
if (m_accurate_nans)
SetJumpTarget(nan_fixup);
if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
if (V1Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V1Q);
if (V2Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V2Q);
if (V3Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V3Q);
if (temp_gpr != ARM64Reg::INVALID_REG)
gpr.Unlock(temp_gpr);
ASSERT_MSG(DYNA_REC, singles == singles_func(),
"Register allocation turned singles into doubles in the middle of ps_arith");
@ -283,32 +377,91 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
const u32 c = inst.FC;
const u32 d = inst.FD;
const bool upper = inst.SUBOP5 == 11;
const bool upper = inst.SUBOP5 & 0x1;
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
const RegType type = singles ? RegType::Single : RegType::Register;
const u8 size = singles ? 32 : 64;
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
const auto scalar_reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;
const ARM64Reg VA = fpr.R(a, type);
const ARM64Reg VB = fpr.R(b, type);
const ARM64Reg VC = fpr.R(c, type);
const ARM64Reg VD = fpr.RW(d, type);
const ARM64Reg V0 = fpr.GetReg();
const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG;
const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1);
if (d != c)
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
FixupBranch a_nan_done, b_nan_done;
if (m_accurate_nans)
{
m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA));
m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
const auto check_nan = [&](ARM64Reg input) {
m_float_emit.FCMP(scalar_reg_encoder(input));
FixupBranch not_nan = B(CCFlags::CC_VC);
FixupBranch nan = B();
SetJumpTarget(not_nan);
SwitchToFarCode();
SetJumpTarget(nan);
EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr);
if (upper)
{
m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1));
}
else if (d != c)
{
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.INS(size, VD, 1, VC, 1);
}
else
{
m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.INS(size, VD, 0, V1, 0);
}
FixupBranch nan_done = B();
SwitchToNearCode();
return nan_done;
};
a_nan_done = check_nan(VA);
b_nan_done = check_nan(V0);
}
if (upper)
{
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
}
else if (d != c)
{
m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
m_float_emit.INS(size, VD, 1, VC, 1);
}
else
{
m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA));
m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
m_float_emit.INS(size, VD, 0, V0, 0);
}
if (m_accurate_nans)
{
SetJumpTarget(a_nan_done);
SetJumpTarget(b_nan_done);
}
fpr.Unlock(V0);
if (m_accurate_nans)
fpr.Unlock(V1);
if (temp_gpr != ARM64Reg::INVALID_REG)
gpr.Unlock(temp_gpr);
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
"Register allocation turned singles into doubles in the middle of ps_sumX");