Merge pull request #12231 from JosJuice/jitarm64-frsqrte-optimization

JitArm64: Optimize frsqrte routine
This commit is contained in:
Mai 2023-11-28 04:19:27 +01:00 committed by GitHub
commit 7e9c19fdb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 27 additions and 30 deletions

View File

@@ -686,7 +686,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst)
const ARM64Reg VB = fpr.R(b, RegType::LowerPair); const ARM64Reg VB = fpr.R(b, RegType::LowerPair);
const ARM64Reg VD = fpr.RW(d, RegType::LowerPair); const ARM64Reg VD = fpr.RW(d, RegType::LowerPair);
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB)); m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB));
@@ -697,7 +697,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst)
SetFPRFIfNeeded(false, ARM64Reg::X0); SetFPRFIfNeeded(false, ARM64Reg::X0);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0); fpr.Unlock(ARM64Reg::Q0);
} }

View File

@@ -514,7 +514,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst)
const u32 b = inst.FB; const u32 b = inst.FB;
const u32 d = inst.FD; const u32 d = inst.FD;
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0); fpr.Lock(ARM64Reg::Q0);
const ARM64Reg VB = fpr.R(b, RegType::Register); const ARM64Reg VB = fpr.R(b, RegType::Register);
@@ -529,7 +529,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst)
BL(GetAsmRoutines()->frsqrte); BL(GetAsmRoutines()->frsqrte);
m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0); m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0); fpr.Unlock(ARM64Reg::Q0);
fpr.FixSinglePrecision(d); fpr.FixSinglePrecision(d);

View File

@@ -307,7 +307,7 @@ void JitArm64::GenerateFres()
} }
// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE. // Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE.
// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags. // Output in X0 and memory (PPCState). Clobbers X0-X3 and flags.
void JitArm64::GenerateFrsqrte() void JitArm64::GenerateFrsqrte()
{ {
// The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and // The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and
@@ -323,35 +323,32 @@ void JitArm64::GenerateFrsqrte()
CMP(ARM64Reg::X2, ARM64Reg::X3); CMP(ARM64Reg::X2, ARM64Reg::X3);
FixupBranch nan_or_inf = B(CCFlags::CC_EQ); FixupBranch nan_or_inf = B(CCFlags::CC_EQ);
FixupBranch negative = TBNZ(ARM64Reg::X1, 63); FixupBranch negative = TBNZ(ARM64Reg::X1, 63);
AND(ARM64Reg::X3, ARM64Reg::X1, LogicalImm(Common::DOUBLE_FRAC, 64));
FixupBranch normal = CBNZ(ARM64Reg::X2); FixupBranch normal = CBNZ(ARM64Reg::X2);
// "Normalize" denormal values // "Normalize" denormal values.
CLZ(ARM64Reg::X3, ARM64Reg::X3); // The simplified calculation used here results in the upper 11 bits being incorrect,
SUB(ARM64Reg::X4, ARM64Reg::X3, 11); // but that's fine, because the code below never reads those bits.
MOVI2R(ARM64Reg::X2, 0x00C0'0000'0000'0000); CLZ(ARM64Reg::X3, ARM64Reg::X1);
LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4); LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);
SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); LSR(ARM64Reg::X1, ARM64Reg::X1, 11);
AND(ARM64Reg::X3, ARM64Reg::X4, LogicalImm(Common::DOUBLE_FRAC - 1, 64)); BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12);
SetJumpTarget(normal); SetJumpTarget(normal);
LSR(ARM64Reg::X2, ARM64Reg::X2, 48); UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5);
AND(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64)); MOVP2R(ARM64Reg::X3, &Common::frsqrte_expected);
MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected); ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSR, 48)); LDP(IndexType::Signed, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::X2, 0);
ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 11);
LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0);
UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11);
AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64)); AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64));
MADD(ARM64Reg::W3, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::W1); MADD(ARM64Reg::W1, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3);
ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 26)); ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1, ArithOption(ARM64Reg::X1, ShiftType::LSL, 26));
RET(); RET();
SetJumpTarget(zero); SetJumpTarget(zero);
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26); FixupBranch skip_set_zx = TBNZ(ARM64Reg::W3, 26);
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2);
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
SetJumpTarget(skip_set_zx); SetJumpTarget(skip_set_zx);
RET(); RET();
@@ -361,10 +358,10 @@ void JitArm64::GenerateFrsqrte()
FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ); FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ);
SetJumpTarget(negative); SetJumpTarget(negative);
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9); FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W3, 9);
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2);
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
SetJumpTarget(skip_set_vxsqrt); SetJumpTarget(skip_set_vxsqrt);
SetJumpTarget(nan_or_positive_inf); SetJumpTarget(nan_or_positive_inf);
RET(); RET();