diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 084ab24902..87dae220df 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2173,6 +2173,12 @@ void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Re
           (DecodeReg(Rn) << 5) | DecodeReg(Rd));
 }
 
+void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
+{
+  Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) |
+          (DecodeReg(Rn) << 5) | DecodeReg(Rd));
+}
+
 void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
   ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!");
@@ -2985,6 +2991,28 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
   EmitScalar2RegMisc(1, IsDouble(Rd) ? 3 : 2, 0x1D, Rd, Rn);
 }
 
+// Scalar - pairwise
+void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
+}
+
 // Scalar - 2 Source
 void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 09a2633660..baaf598741 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -1130,6 +1130,13 @@ public:
   void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
   void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);
 
+  // Scalar - pairwise
+  void FADDP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMINP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
+  void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
+
   // Scalar - 2 Source
   void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -1296,6 +1303,7 @@ private:
   void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
   void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
   void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+  void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
   void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                     ARM64Reg Rn);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 9372fffd2d..ae87d815c8 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -177,6 +177,10 @@ public:
 
   void FloatCompare(UGeckoInstruction inst, bool upper = false);
 
+  // temp_gpr can be INVALID_REG if single is true
+  void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
+                               Arm64Gen::ARM64Reg temp_gpr);
+
   bool IsFPRStoreSafe(size_t guest_reg) const;
 
 protected:
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 26c6dfd1b7..c5624ee6d4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -3,6 +3,8 @@
 
 #include "Core/PowerPC/JitArm64/Jit.h"
 
+#include <optional>
+
 #include "Common/Arm64Emitter.h"
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
@@ -66,11 +68,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   FALLBACK_IF(inst.Rc);
   FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18));
 
-  u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
-  u32 op5 = inst.SUBOP5;
+  const u32 a = inst.FA;
+  const u32 b = inst.FB;
+  const u32 c = inst.FC;
+  const u32 d = inst.FD;
+  const u32 op5 = inst.SUBOP5;
 
   const bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
   const bool use_b = op5 != 25;  // fmul uses no B
+  const bool fma = use_b && use_c;
+  const bool negate_result = (op5 & ~0x1) == 30;
+
+  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
+  const bool can_generate_nan = (op5 & ~0x1) != 20;
 
   const bool output_is_single = inst.OPCD == 59;
   const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
@@ -82,53 +92,69 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   };
   const bool inputs_are_singles = inputs_are_singles_func();
 
-  const RegType type =
-      (inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
+  const bool single = inputs_are_singles && output_is_single;
+  const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
   const RegType type_out = output_is_single ?
                                (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
                                RegType::LowerPair;
-  const auto reg_encoder =
-      (inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
+  const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
-  ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
+  const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
+  ARM64Reg rounded_c_reg = VC;
   if (round_c)
   {
     ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
 
-    V1Q = fpr.GetReg();
-
-    Force25BitPrecision(reg_encoder(V1Q), VC);
-    VC = reg_encoder(V1Q);
+    V0Q = fpr.GetReg();
+    rounded_c_reg = reg_encoder(V0Q);
+    Force25BitPrecision(rounded_c_reg, VC);
   }
 
-  ARM64Reg inaccurate_fma_temp_reg = VD;
-  if (inaccurate_fma && d == b)
+  ARM64Reg inaccurate_fma_reg = VD;
+  if (fma && inaccurate_fma && VD == VB)
   {
-    V0Q = fpr.GetReg();
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
+    inaccurate_fma_reg = reg_encoder(V0Q);
+  }
 
-    inaccurate_fma_temp_reg = reg_encoder(V0Q);
+  ARM64Reg result_reg = VD;
+  const bool preserve_d =
+      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+  if (preserve_d)
+  {
+    V1Q = fpr.GetReg();
+    result_reg = reg_encoder(V1Q);
+  }
+
+  const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;
+
+  if (m_accurate_nans)
+  {
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
   }
 
   switch (op5)
   {
   case 18:
-    m_float_emit.FDIV(VD, VA, VB);
+    m_float_emit.FDIV(result_reg, VA, VB);
     break;
   case 20:
-    m_float_emit.FSUB(VD, VA, VB);
+    m_float_emit.FSUB(result_reg, VA, VB);
     break;
   case 21:
-    m_float_emit.FADD(VD, VA, VB);
+    m_float_emit.FADD(result_reg, VA, VB);
    break;
   case 25:
-    m_float_emit.FMUL(VD, VA, VC);
+    m_float_emit.FMUL(result_reg, VA, rounded_c_reg);
     break;
   // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
   // the subtly different definitions affect how signed zeroes are handled.
@@ -138,39 +164,116 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
-      m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
+      m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
+      m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      m_float_emit.FNMSUB(VD, VA, VC, VB);
+      m_float_emit.FNMSUB(result_reg, VA, rounded_c_reg, VB);
     }
-    if (op5 == 30)
-      m_float_emit.FNEG(VD, VD);
     break;
   case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
   case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
-      m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
+      m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
+      m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      m_float_emit.FMADD(VD, VA, VC, VB);
+      m_float_emit.FMADD(result_reg, VA, rounded_c_reg, VB);
     }
-    if (op5 == 31)
-      m_float_emit.FNEG(VD, VD);
     break;
   default:
     ASSERT_MSG(DYNA_REC, 0, "fp_arith");
     break;
   }
 
+  std::vector<FixupBranch> nan_fixups;
+  if (m_accurate_nans)
+  {
+    // Check if we need to handle NaNs
+    m_float_emit.FCMP(result_reg);
+    FixupBranch no_nan = B(CCFlags::CC_VC);
+    FixupBranch nan = B();
+    SetJumpTarget(no_nan);
+
+    SwitchToFarCode();
+    SetJumpTarget(nan);
+
+    const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);
+
+    EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);
+
+    std::vector<ARM64Reg> inputs;
+    inputs.push_back(VA);
+    if (use_b && VA != VB)
+      inputs.push_back(VB);
+    if (use_c && VA != VC && (!use_b || VB != VC))
+      inputs.push_back(VC);
+
+    // If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
+    for (size_t i = 0; i < inputs.size(); ++i)
+    {
+      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
+      // least one NaN input
+      const bool check_input = can_generate_nan || i != inputs.size() - 1;
+
+      const ARM64Reg input = inputs[i];
+      FixupBranch skip;
+      if (check_input)
+      {
+        m_float_emit.FCMP(input);
+        skip = B(CCFlags::CC_VC);
+      }
+
+      m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
+                       EncodeRegToDouble(quiet_bit_reg));
+      nan_fixups.push_back(B());
+
+      if (check_input)
+        SetJumpTarget(skip);
+    }
+
+    std::optional<FixupBranch> nan_early_fixup;
+    if (can_generate_nan)
+    {
+      // There was no NaN in any of the inputs, so the NaN must have been generated by the
+      // arithmetic instruction. In this case, the result is already correct.
+      if (negate_result)
+      {
+        if (result_reg != VD)
+          m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+        nan_fixups.push_back(B());
+      }
+      else
+      {
+        nan_early_fixup = B();
+      }
+    }
+
+    SwitchToNearCode();
+
+    if (nan_early_fixup)
+      SetJumpTarget(*nan_early_fixup);
+  }
+
+  // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
+  // for any of AArch64's FMA instructions, so we negate using a separate instruction.
+  if (negate_result)
+    m_float_emit.FNEG(VD, result_reg);
+  else if (result_reg != VD)
+    m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
+
+  for (FixupBranch fixup : nan_fixups)
+    SetJumpTarget(fixup);
+
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
   if (V1Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V1Q);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   if (output_is_single)
   {
@@ -782,6 +885,29 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
   }
 }
 
+void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
+{
+  // dest_reg = QNaN & ~SNaN
+  //
+  // (Alternatively, dest_reg = QNaN would also work, but that would take
+  // two instructions to emit even for singles)
+
+  if (single)
+  {
+    m_float_emit.MOVI(32, dest_reg, 0x40, 16);
+  }
+  else
+  {
+    ASSERT(temp_gpr != ARM64Reg::INVALID_REG);
+
+    MOVI2R(EncodeRegTo64(temp_gpr), 0x0008'0000'0000'0000);
+    if (IsQuad(dest_reg))
+      m_float_emit.DUP(64, dest_reg, EncodeRegTo64(temp_gpr));
+    else
+      m_float_emit.FMOV(dest_reg, EncodeRegTo64(temp_gpr));
+  }
+}
+
 bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
 {
   return js.fpr_is_store_safe[guest_reg];
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 239a235533..4c0730f9d0 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -83,8 +83,14 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const u32 d = inst.FD;
   const u32 op5 = inst.SUBOP5;
 
+  const bool muls = (op5 & ~0x1) == 12;
+  const bool madds = (op5 & ~0x1) == 14;
   const bool use_c = op5 == 25 || (op5 & ~0x13) == 12;  // mul, muls, and all kinds of maddXX
-  const bool use_b = op5 != 25 && (op5 & ~0x1) != 12;  // mul and muls don't use B
+  const bool use_b = op5 != 25 && !muls;  // mul and muls don't use B
+  const bool duplicated_c = muls || madds;
+  const bool fma = use_b && use_c;
+  const bool negate_result = (op5 & ~0x1) == 30;
+  const bool msub = op5 == 28 || op5 == 30;
 
   const auto singles_func = [&] {
     return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
@@ -99,147 +105,127 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
 
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
-  ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
+  const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 
   ARM64Reg V0Q = ARM64Reg::INVALID_REG;
-  ARM64Reg V0 = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
+  ARM64Reg V2Q = ARM64Reg::INVALID_REG;
+  ARM64Reg V3Q = ARM64Reg::INVALID_REG;
 
-  const auto allocate_v0_if_needed = [&] {
-    if (V0Q == ARM64Reg::INVALID_REG)
-    {
-      V0Q = fpr.GetReg();
-      V0 = reg_encoder(V0Q);
-    }
-  };
-
+  ARM64Reg rounded_c_reg = VC;
   if (round_c)
   {
     ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
 
-    V1Q = fpr.GetReg();
-
-    Force25BitPrecision(reg_encoder(V1Q), VC);
-    VC = reg_encoder(V1Q);
+    V0Q = fpr.GetReg();
+    rounded_c_reg = reg_encoder(V0Q);
+    Force25BitPrecision(rounded_c_reg, VC);
   }
 
-  ARM64Reg inaccurate_fma_temp_reg = VD;
-  if (inaccurate_fma && d == b)
+  ARM64Reg inaccurate_fma_reg = VD;
+  if (fma && inaccurate_fma && VD == VB)
   {
-    allocate_v0_if_needed();
-    inaccurate_fma_temp_reg = V0;
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
+    inaccurate_fma_reg = reg_encoder(V0Q);
   }
 
   ARM64Reg result_reg = VD;
+  const bool need_accurate_fma_reg =
+      fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
+  const bool preserve_d =
+      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+  if (need_accurate_fma_reg || preserve_d)
+  {
+    V1Q = fpr.GetReg();
+    result_reg = reg_encoder(V1Q);
+  }
+
+  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
+
+  if (m_accurate_nans)
+  {
+    if (V0Q == ARM64Reg::INVALID_REG)
+      V0Q = fpr.GetReg();
+
+    V2Q = fpr.GetReg();
+
+    if (duplicated_c || VD == result_reg)
+      V3Q = fpr.GetReg();
+  }
+
   switch (op5)
   {
   case 12:  // ps_muls0: d = a * c.ps0
-    m_float_emit.FMUL(size, VD, VA, VC, 0);
+    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 0);
     break;
   case 13:  // ps_muls1: d = a * c.ps1
-    m_float_emit.FMUL(size, VD, VA, VC, 1);
+    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
    break;
   case 14:  // ps_madds0: d = a * c.ps0 + b
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 0);
-      m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
-    }
-    else if (VD == VB)
-    {
-      m_float_emit.FMLA(size, VD, VA, VC, 0);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.MOV(VD, VB);
-      m_float_emit.FMLA(size, VD, VA, VC, 0);
+      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
+      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      allocate_v0_if_needed();
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC, 0);
-      result_reg = V0;
+      if (result_reg != VB)
+        m_float_emit.MOV(result_reg, VB);
+      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 0);
     }
     break;
   case 15:  // ps_madds1: d = a * c.ps1 + b
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 1);
-      m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
-    }
-    else if (VD == VB)
-    {
-      m_float_emit.FMLA(size, VD, VA, VC, 1);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.MOV(VD, VB);
-      m_float_emit.FMLA(size, VD, VA, VC, 1);
+      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
+      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      allocate_v0_if_needed();
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC, 1);
-      result_reg = V0;
+      if (result_reg != VB)
+        m_float_emit.MOV(result_reg, VB);
+      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 1);
     }
     break;
   case 18:  // ps_div
-    m_float_emit.FDIV(size, VD, VA, VB);
+    m_float_emit.FDIV(size, result_reg, VA, VB);
     break;
   case 20:  // ps_sub
-    m_float_emit.FSUB(size, VD, VA, VB);
+    m_float_emit.FSUB(size, result_reg, VA, VB);
     break;
   case 21:  // ps_add
-    m_float_emit.FADD(size, VD, VA, VB);
+    m_float_emit.FADD(size, result_reg, VA, VB);
     break;
   case 25:  // ps_mul
-    m_float_emit.FMUL(size, VD, VA, VC);
+    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg);
     break;
   case 28:  // ps_msub: d = a * c - b
   case 30:  // ps_nmsub: d = -(a * c - b)
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC);
-      m_float_emit.FSUB(size, VD, inaccurate_fma_temp_reg, VB);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.FNEG(size, VD, VB);
-      m_float_emit.FMLA(size, VD, VA, VC);
+      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
+      m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      allocate_v0_if_needed();
-      m_float_emit.FNEG(size, V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC);
-      result_reg = V0;
+      m_float_emit.FNEG(size, result_reg, VB);
+      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
     }
     break;
   case 29:  // ps_madd: d = a * c + b
   case 31:  // ps_nmadd: d = -(a * c + b)
     if (inaccurate_fma)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC);
-      m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
-    }
-    else if (VD == VB)
-    {
-      m_float_emit.FMLA(size, VD, VA, VC);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.MOV(VD, VB);
-      m_float_emit.FMLA(size, VD, VA, VC);
+      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
+      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
     }
     else
     {
-      allocate_v0_if_needed();
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC);
-      result_reg = V0;
+      if (result_reg != VB)
+        m_float_emit.MOV(result_reg, VB);
+      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
     }
     break;
   default:
@@ -247,24 +233,89 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   }
 
-  switch (op5)
+  FixupBranch nan_fixup;
+  if (m_accurate_nans)
   {
-  case 30:  // ps_nmsub
-  case 31:  // ps_nmadd
-    // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
-    // for any of AArch64's FMA instructions, so we negate using a separate instruction.
-    m_float_emit.FNEG(size, VD, result_reg);
-    break;
-  default:
-    if (result_reg != VD)
-      m_float_emit.MOV(VD, result_reg);
-    break;
+    const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
+    const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+
+    const ARM64Reg zero_reg = reg_encoder(V2Q);
+
+    // Check if we need to handle NaNs
+
+    m_float_emit.FMAXP(nan_temp_reg, result_reg);
+    m_float_emit.FCMP(nan_temp_reg);
+    FixupBranch no_nan = B(CCFlags::CC_VC);
+    FixupBranch nan = B();
+    SetJumpTarget(no_nan);
+
+    SwitchToFarCode();
+    SetJumpTarget(nan);
+
+    // Pick the right NaNs
+
+    m_float_emit.MOVI(8, zero_reg, 0);
+
+    const auto check_input = [&](ARM64Reg input) {
+      m_float_emit.FACGE(size, nan_temp_reg_paired, input, zero_reg);
+      m_float_emit.BIF(result_reg, input, nan_temp_reg_paired);
+    };
+
+    ARM64Reg c_reg_for_nan_purposes = VC;
+    if (duplicated_c)
+    {
+      c_reg_for_nan_purposes = reg_encoder(V3Q);
+      m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1);
+    }
+
+    if (use_c)
+      check_input(c_reg_for_nan_purposes);
+
+    if (use_b && (!use_c || VB != c_reg_for_nan_purposes))
+      check_input(VB);
+
+    if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes))
+      check_input(VA);
+
+    // Make the NaNs quiet
+
+    const ARM64Reg quiet_bit_reg = VD == result_reg ? reg_encoder(V3Q) : VD;
+    EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr);
+
+    m_float_emit.FACGE(size, nan_temp_reg_paired, result_reg, zero_reg);
+    m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg);
+    if (negate_result)
+      m_float_emit.FNEG(size, result_reg, result_reg);
+    if (VD == result_reg)
+      m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired);
+    else  // quiet_bit_reg == VD
+      m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
+
+    nan_fixup = B();
+
+    SwitchToNearCode();
   }
 
+  // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
+  // for any of AArch64's FMA instructions, so we negate using a separate instruction.
+  if (negate_result)
+    m_float_emit.FNEG(size, VD, result_reg);
+  else if (result_reg != VD)
+    m_float_emit.MOV(VD, result_reg);
+
+  if (m_accurate_nans)
+    SetJumpTarget(nan_fixup);
+
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
   if (V1Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V1Q);
+  if (V2Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V2Q);
+  if (V3Q != ARM64Reg::INVALID_REG)
+    fpr.Unlock(V3Q);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   ASSERT_MSG(DYNA_REC, singles == singles_func(),
              "Register allocation turned singles into doubles in the middle of ps_arith");
@@ -326,32 +377,91 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
   const u32 c = inst.FC;
   const u32 d = inst.FD;
 
-  const bool upper = inst.SUBOP5 == 11;
+  const bool upper = inst.SUBOP5 & 0x1;
 
   const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
   const RegType type = singles ? RegType::Single : RegType::Register;
   const u8 size = singles ? 32 : 64;
   const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
+  const auto scalar_reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;
 
   const ARM64Reg VA = fpr.R(a, type);
   const ARM64Reg VB = fpr.R(b, type);
   const ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
   const ARM64Reg V0 = fpr.GetReg();
+  const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG;
+  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
 
-  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1);
-  if (d != c)
-  {
-    m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-    m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
+  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
+
+  FixupBranch a_nan_done, b_nan_done;
+  if (m_accurate_nans)
+  {
+    const auto check_nan = [&](ARM64Reg input) {
+      m_float_emit.FCMP(scalar_reg_encoder(input));
+      FixupBranch not_nan = B(CCFlags::CC_VC);
+      FixupBranch nan = B();
+      SetJumpTarget(not_nan);
+
+      SwitchToFarCode();
+      SetJumpTarget(nan);
+
+      EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr);
+
+      if (upper)
+      {
+        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1));
+      }
+      else if (d != c)
+      {
+        m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.INS(size, VD, 1, VC, 1);
+      }
+      else
+      {
+        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.INS(size, VD, 0, V1, 0);
+      }
+
+      FixupBranch nan_done = B();
+      SwitchToNearCode();
+
+      return nan_done;
+    };
+
+    a_nan_done = check_nan(VA);
+    b_nan_done = check_nan(V0);
+  }
+
+  if (upper)
+  {
+    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
+  }
+  else if (d != c)
+  {
+    m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.INS(size, VD, 1, VC, 1);
   }
   else
   {
-    m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-    m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
+    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+    m_float_emit.INS(size, VD, 0, V0, 0);
+  }
+
+  if (m_accurate_nans)
+  {
+    SetJumpTarget(a_nan_done);
+    SetJumpTarget(b_nan_done);
   }
 
   fpr.Unlock(V0);
+  if (m_accurate_nans)
+    fpr.Unlock(V1);
+  if (temp_gpr != ARM64Reg::INVALID_REG)
+    gpr.Unlock(temp_gpr);
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
             "Register allocation turned singles into doubles in the middle of ps_sumX");
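
Note: a quick standalone sanity check for the bit layout EmitScalarPairwise emits (a sketch, not part of the patch; EncodeScalarPairwise is a hypothetical stand-in that inlines the same expression with raw register numbers, and the reference encodings are assumed from the Arm ARM's AdvSIMD scalar pairwise group: 01 U 11110 size 11000 opcode 10 Rn Rd):

#include <cstdint>

// Mirrors the Write32 expression in the patch, minus DecodeReg().
static constexpr uint32_t EncodeScalarPairwise(bool U, uint32_t size, uint32_t opcode,
                                               uint32_t Rd, uint32_t Rn)
{
  return (1u << 30) | (uint32_t{U} << 29) | (0b111100011u << 20) | (size << 22) |
         (opcode << 12) | (1u << 11) | (Rn << 5) | Rd;
}

int main()
{
  // FADDP D0, V0.2D (U=1, size=01, opcode=01101) and FADDP S0, V0.2S (size=00).
  static_assert(EncodeScalarPairwise(true, 1, 0b01101, 0, 0) == 0x7E70D800);
  static_assert(EncodeScalarPairwise(true, 0, 0b01101, 0, 0) == 0x7E30D800);
  return 0;
}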
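
Note: a minimal host-side illustration of the quiet-bit trick EmitQuietNaNBitConstant relies on (a sketch, assuming C++20 for std::bit_cast; the constants mirror the patch: 0x0008'0000'0000'0000 for doubles, 0x40 << 16 == 0x0040'0000 for singles). OR-ing the quiet bit into a signaling NaN yields the corresponding quiet NaN without disturbing the sign or the rest of the payload, which is exactly what the emitted ORR does to the picked input:

#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
  // Double-precision: the quiet bit is QNaN & ~SNaN, i.e. bit 51.
  const uint64_t snan_bits = 0x7FF0'0000'0000'0001;  // exponent all ones, quiet bit clear
  const uint64_t quiet_bit = 0x0008'0000'0000'0000;
  const double quieted = std::bit_cast<double>(snan_bits | quiet_bit);
  assert(quieted != quieted);  // still a NaN after quieting

  // Single-precision: the same bit sits at position 22, so MOVI(32, dest, 0x40, 16)
  // can materialize it in one instruction.
  const uint32_t snan_bits32 = 0x7F80'0001;
  const uint32_t quiet_bit32 = 0x40u << 16;
  const float quieted32 = std::bit_cast<float>(snan_bits32 | quiet_bit32);
  assert(quieted32 != quieted32);
  return 0;
}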