JitArm64_Paired: Use ScopedARM64Reg

Sintendo 2024-06-23 23:20:32 +02:00
parent 3b251dbb2a
commit be2b466743
1 changed file with 211 additions and 213 deletions
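This commit replaces the manual fpr.GetReg() / fpr.Unlock() pairing in the paired-single emitters with Arm64FPRCache::ScopedARM64Reg handles obtained from fpr.GetScopedReg(). The handle unlocks its register automatically when it goes out of scope, which is why each function body below gains an explicit block scope and loses its trailing Unlock calls. A minimal sketch of the RAII idea, assuming hypothetical stand-in types rather than Dolphin's actual classes:

#include <utility>

// Hypothetical stand-ins for Dolphin's ARM64Reg and Arm64FPRCache, only here
// to keep the sketch self-contained.
enum class Reg { INVALID, Q0, Q1, Q2 };

struct RegCache
{
  Reg Get() { return Reg::Q0; }  // lock and hand out a scratch register (stub)
  void Unlock(Reg) {}            // return it to the cache (stub)
};

// The RAII idea: a move-only handle that unlocks its register when destroyed.
class ScopedReg
{
public:
  ScopedReg() = default;
  ScopedReg(RegCache& cache, Reg reg) : m_cache(&cache), m_reg(reg) {}

  ScopedReg(const ScopedReg&) = delete;
  ScopedReg& operator=(const ScopedReg&) = delete;

  ScopedReg(ScopedReg&& other) noexcept
      : m_cache(std::exchange(other.m_cache, nullptr)),
        m_reg(std::exchange(other.m_reg, Reg::INVALID))
  {
  }

  // Assignment releases any register already held, then takes ownership of
  // the new one -- this is what lets call sites start with an empty handle
  // and allocate lazily.
  ScopedReg& operator=(ScopedReg&& other) noexcept
  {
    Release();
    m_cache = std::exchange(other.m_cache, nullptr);
    m_reg = std::exchange(other.m_reg, Reg::INVALID);
    return *this;
  }

  ~ScopedReg() { Release(); }

  // Usable wherever a plain register is expected (e.g. reg_encoder, or
  // comparisons against INVALID).
  operator Reg() const { return m_reg; }

private:
  void Release()
  {
    if (m_cache && m_reg != Reg::INVALID)
      m_cache->Unlock(m_reg);
    m_cache = nullptr;
    m_reg = Reg::INVALID;
  }

  RegCache* m_cache = nullptr;
  Reg m_reg = Reg::INVALID;
};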


@@ -108,201 +108,196 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 
-  ARM64Reg V0Q = ARM64Reg::INVALID_REG;
-  ARM64Reg V1Q = ARM64Reg::INVALID_REG;
-  ARM64Reg V2Q = ARM64Reg::INVALID_REG;
-
-  ARM64Reg rounded_c_reg = VC;
-  if (round_c)
-  {
-    ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
-
-    V0Q = fpr.GetReg();
-    rounded_c_reg = reg_encoder(V0Q);
-    Force25BitPrecision(rounded_c_reg, VC);
-  }
-
-  ARM64Reg inaccurate_fma_reg = VD;
-  if (fma && inaccurate_fma && VD == VB)
-  {
-    if (V0Q == ARM64Reg::INVALID_REG)
-      V0Q = fpr.GetReg();
-    inaccurate_fma_reg = reg_encoder(V0Q);
-  }
-
-  ARM64Reg result_reg = VD;
-  const bool need_accurate_fma_reg =
-      fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
-  const bool preserve_d =
-      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
-  if (need_accurate_fma_reg || preserve_d)
-  {
-    V1Q = fpr.GetReg();
-    result_reg = reg_encoder(V1Q);
-  }
-
-  if (m_accurate_nans)
-  {
-    if (V0Q == ARM64Reg::INVALID_REG)
-      V0Q = fpr.GetReg();
-
-    if (duplicated_c || VD == result_reg)
-      V2Q = fpr.GetReg();
-  }
-
-  switch (op5)
-  {
-  case 12: // ps_muls0: d = a * c.ps0
-    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 0);
-    break;
-  case 13: // ps_muls1: d = a * c.ps1
-    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
-    break;
-  case 14: // ps_madds0: d = a * c.ps0 + b
-    if (inaccurate_fma)
-    {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
-    }
-    else
-    {
-      if (result_reg != VB)
-        m_float_emit.MOV(result_reg, VB);
-      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 0);
-    }
-    break;
-  case 15: // ps_madds1: d = a * c.ps1 + b
-    if (inaccurate_fma)
-    {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
-    }
-    else
-    {
-      if (result_reg != VB)
-        m_float_emit.MOV(result_reg, VB);
-      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 1);
-    }
-    break;
-  case 18: // ps_div
-    m_float_emit.FDIV(size, result_reg, VA, VB);
-    break;
-  case 20: // ps_sub
-    m_float_emit.FSUB(size, result_reg, VA, VB);
-    break;
-  case 21: // ps_add
-    m_float_emit.FADD(size, result_reg, VA, VB);
-    break;
-  case 25: // ps_mul
-    m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg);
-    break;
-  case 28: // ps_msub: d = a * c - b
-  case 30: // ps_nmsub: d = -(a * c - b)
-    if (inaccurate_fma)
-    {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
-    }
-    else
-    {
-      m_float_emit.FNEG(size, result_reg, VB);
-      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
-    }
-    break;
-  case 29: // ps_madd: d = a * c + b
-  case 31: // ps_nmadd: d = -(a * c + b)
-    if (inaccurate_fma)
-    {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
-    }
-    else
-    {
-      if (result_reg != VB)
-        m_float_emit.MOV(result_reg, VB);
-      m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
-    }
-    break;
-  default:
-    ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op");
-    break;
-  }
-
-  FixupBranch nan_fixup;
-  if (m_accurate_nans)
-  {
-    const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
-    const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
-
-    // Check if we need to handle NaNs
-    m_float_emit.FMAXP(nan_temp_reg, result_reg);
-    m_float_emit.FCMP(nan_temp_reg);
-    FixupBranch no_nan = B(CCFlags::CC_VC);
-    FixupBranch nan = B();
-    SetJumpTarget(no_nan);
-
-    SwitchToFarCode();
-    SetJumpTarget(nan);
-
-    // Pick the right NaNs
-    const auto check_input = [&](ARM64Reg input) {
-      m_float_emit.FCMEQ(size, nan_temp_reg_paired, input, input);
-      m_float_emit.BIF(result_reg, input, nan_temp_reg_paired);
-    };
-
-    ARM64Reg c_reg_for_nan_purposes = VC;
-    if (duplicated_c)
-    {
-      c_reg_for_nan_purposes = reg_encoder(V2Q);
-      m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1);
-    }
-
-    if (use_c)
-      check_input(c_reg_for_nan_purposes);
-
-    if (use_b && (!use_c || VB != c_reg_for_nan_purposes))
-      check_input(VB);
-
-    if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes))
-      check_input(VA);
-
-    // Make the NaNs quiet
-    const ARM64Reg quiet_nan_reg = VD == result_reg ? reg_encoder(V2Q) : VD;
-    m_float_emit.FADD(size, quiet_nan_reg, result_reg, result_reg);
-    m_float_emit.FCMEQ(size, nan_temp_reg_paired, result_reg, result_reg);
-    if (negate_result)
-      m_float_emit.FNEG(size, result_reg, result_reg);
-    if (VD == result_reg)
-      m_float_emit.BIF(VD, quiet_nan_reg, nan_temp_reg_paired);
-    else // quiet_nan_reg == VD
-      m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
-
-    nan_fixup = B();
-    SwitchToNearCode();
-  }
-
-  // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
-  // for any of AArch64's FMA instructions, so we negate using a separate instruction.
-  if (negate_result)
-    m_float_emit.FNEG(size, VD, result_reg);
-  else if (result_reg != VD)
-    m_float_emit.MOV(VD, result_reg);
-
-  if (m_accurate_nans)
-    SetJumpTarget(nan_fixup);
-
-  if (V0Q != ARM64Reg::INVALID_REG)
-    fpr.Unlock(V0Q);
-  if (V1Q != ARM64Reg::INVALID_REG)
-    fpr.Unlock(V1Q);
-  if (V2Q != ARM64Reg::INVALID_REG)
-    fpr.Unlock(V2Q);
+  {
+    Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
+    Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;
+    Arm64FPRCache::ScopedARM64Reg V2Q = ARM64Reg::INVALID_REG;
+
+    ARM64Reg rounded_c_reg = VC;
+    if (round_c)
+    {
+      ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
+
+      V0Q = fpr.GetScopedReg();
+      rounded_c_reg = reg_encoder(V0Q);
+      Force25BitPrecision(rounded_c_reg, VC);
+    }
+
+    ARM64Reg inaccurate_fma_reg = VD;
+    if (fma && inaccurate_fma && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      inaccurate_fma_reg = reg_encoder(V0Q);
+    }
+
+    ARM64Reg result_reg = VD;
+    const bool need_accurate_fma_reg =
+        fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
+    const bool preserve_d =
+        m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+    if (need_accurate_fma_reg || preserve_d)
+    {
+      V1Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V1Q);
+    }
+
+    if (m_accurate_nans)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+
+      if (duplicated_c || VD == result_reg)
+        V2Q = fpr.GetScopedReg();
+    }
+
+    switch (op5)
+    {
+    case 12: // ps_muls0: d = a * c.ps0
+      m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 0);
+      break;
+    case 13: // ps_muls1: d = a * c.ps1
+      m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
+      break;
+    case 14: // ps_madds0: d = a * c.ps0 + b
+      if (inaccurate_fma)
+      {
+        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
+        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      }
+      else
+      {
+        if (result_reg != VB)
+          m_float_emit.MOV(result_reg, VB);
+        m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 0);
+      }
+      break;
+    case 15: // ps_madds1: d = a * c.ps1 + b
+      if (inaccurate_fma)
+      {
+        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
+        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      }
+      else
+      {
+        if (result_reg != VB)
+          m_float_emit.MOV(result_reg, VB);
+        m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 1);
+      }
+      break;
+    case 18: // ps_div
+      m_float_emit.FDIV(size, result_reg, VA, VB);
+      break;
+    case 20: // ps_sub
+      m_float_emit.FSUB(size, result_reg, VA, VB);
+      break;
+    case 21: // ps_add
+      m_float_emit.FADD(size, result_reg, VA, VB);
+      break;
+    case 25: // ps_mul
+      m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg);
+      break;
+    case 28: // ps_msub: d = a * c - b
+    case 30: // ps_nmsub: d = -(a * c - b)
+      if (inaccurate_fma)
+      {
+        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
+        m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
+      }
+      else
+      {
+        m_float_emit.FNEG(size, result_reg, VB);
+        m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
+      }
+      break;
+    case 29: // ps_madd: d = a * c + b
+    case 31: // ps_nmadd: d = -(a * c + b)
+      if (inaccurate_fma)
+      {
+        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
+        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      }
+      else
+      {
+        if (result_reg != VB)
+          m_float_emit.MOV(result_reg, VB);
+        m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
+      }
+      break;
+    default:
+      ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op");
+      break;
+    }
+
+    FixupBranch nan_fixup;
+    if (m_accurate_nans)
+    {
+      const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
+      const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+
+      // Check if we need to handle NaNs
+      m_float_emit.FMAXP(nan_temp_reg, result_reg);
+      m_float_emit.FCMP(nan_temp_reg);
+      FixupBranch no_nan = B(CCFlags::CC_VC);
+      FixupBranch nan = B();
+      SetJumpTarget(no_nan);
+
+      SwitchToFarCode();
+      SetJumpTarget(nan);
+
+      // Pick the right NaNs
+      const auto check_input = [&](ARM64Reg input) {
+        m_float_emit.FCMEQ(size, nan_temp_reg_paired, input, input);
+        m_float_emit.BIF(result_reg, input, nan_temp_reg_paired);
+      };
+
+      ARM64Reg c_reg_for_nan_purposes = VC;
+      if (duplicated_c)
+      {
+        c_reg_for_nan_purposes = reg_encoder(V2Q);
+        m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1);
+      }
+
+      if (use_c)
+        check_input(c_reg_for_nan_purposes);
+
+      if (use_b && (!use_c || VB != c_reg_for_nan_purposes))
+        check_input(VB);
+
+      if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes))
+        check_input(VA);
+
+      // Make the NaNs quiet
+      const ARM64Reg quiet_nan_reg = VD == result_reg ? reg_encoder(V2Q) : VD;
+      m_float_emit.FADD(size, quiet_nan_reg, result_reg, result_reg);
+      m_float_emit.FCMEQ(size, nan_temp_reg_paired, result_reg, result_reg);
+      if (negate_result)
+        m_float_emit.FNEG(size, result_reg, result_reg);
+      if (VD == result_reg)
+        m_float_emit.BIF(VD, quiet_nan_reg, nan_temp_reg_paired);
+      else // quiet_nan_reg == VD
+        m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
+
+      nan_fixup = B();
+      SwitchToNearCode();
+    }
+
+    // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
+    // for any of AArch64's FMA instructions, so we negate using a separate instruction.
+    if (negate_result)
+      m_float_emit.FNEG(size, VD, result_reg);
+    else if (result_reg != VD)
+      m_float_emit.MOV(VD, result_reg);
+
+    if (m_accurate_nans)
+      SetJumpTarget(nan_fixup);
+  }
 
   ASSERT_MSG(DYNA_REC, singles == singles_func(),
              "Register allocation turned singles into doubles in the middle of ps_arith");
@@ -339,12 +334,11 @@ void JitArm64::ps_sel(UGeckoInstruction inst)
   }
   else
   {
-    const ARM64Reg V0Q = fpr.GetReg();
+    const auto V0Q = fpr.GetScopedReg();
     const ARM64Reg V0 = reg_encoder(V0Q);
 
     m_float_emit.FCMGE(size, V0, VA);
     m_float_emit.BSL(V0, VC, VB);
     m_float_emit.MOV(VD, V0);
-    fpr.Unlock(V0Q);
   }
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
@@ -375,41 +369,45 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
   const ARM64Reg VB = fpr.R(b, type);
   const ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
-  const ARM64Reg V0 = fpr.GetReg();
-
-  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
-
-  if (m_accurate_nans)
-  {
-    // If the first input is NaN, set the temp register for the second input to 0. This is because:
-    //
-    // - If the second input is also NaN, setting it to 0 ensures that the first NaN will be picked.
-    // - If only the first input is NaN, setting the second input to 0 has no effect on the result.
-    //
-    // Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
-    m_float_emit.FCMP(scalar_reg_encoder(VA));
-    FixupBranch a_not_nan = B(CCFlags::CC_VC);
-    m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
-    SetJumpTarget(a_not_nan);
-  }
-
-  if (upper)
-  {
-    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
-    m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
-  }
-  else if (d != c)
-  {
-    m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
-    m_float_emit.INS(size, VD, 1, VC, 1);
-  }
-  else
-  {
-    m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
-    m_float_emit.INS(size, VD, 0, V0, 0);
-  }
-
-  fpr.Unlock(V0);
+
+  {
+    const auto V0 = fpr.GetScopedReg();
+
+    m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
+
+    if (m_accurate_nans)
+    {
+      // If the first input is NaN, set the temp register for the second input to 0. This is
+      // because:
+      //
+      // - If the second input is also NaN, setting it to 0 ensures that the first NaN will be
+      //   picked.
+      // - If only the first input is NaN, setting the second input to 0 has no effect on the
+      //   result.
+      //
+      // Either way, we can then do an FADD as usual, and the FADD will make the NaN quiet.
+      m_float_emit.FCMP(scalar_reg_encoder(VA));
+      FixupBranch a_not_nan = B(CCFlags::CC_VC);
+      m_float_emit.MOVI(64, scalar_reg_encoder(V0), 0);
+      SetJumpTarget(a_not_nan);
+    }
+
+    if (upper)
+    {
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+      m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
+    }
+    else if (d != c)
+    {
+      m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+      m_float_emit.INS(size, VD, 1, VC, 1);
+    }
+    else
+    {
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA));
+      m_float_emit.INS(size, VD, 0, V0, 0);
+    }
+  }
 
   ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
              "Register allocation turned singles into doubles in the middle of ps_sumX");