JitArm64: Stop using X4 in frsqrte
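This required a change in the denormal path: instead of subtracting 11 from the leading-zero count and shifting left by the result (which needed X4 to hold the adjusted shift amount), we now shift left by the full leading-zero count and then shift right by 11. This shouldn't affect performance.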
This commit is contained in:
parent
4b2f73774f
commit
9807cf0b82
|
@ -685,7 +685,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst)
|
|||
const ARM64Reg VB = fpr.R(b, RegType::LowerPair);
|
||||
const ARM64Reg VD = fpr.RW(d, RegType::LowerPair);
|
||||
|
||||
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
||||
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
|
||||
|
||||
m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB));
|
||||
m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB));
|
||||
|
@ -696,7 +696,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst)
|
|||
|
||||
SetFPRFIfNeeded(false, ARM64Reg::X0);
|
||||
|
||||
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
||||
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
|
||||
fpr.Unlock(ARM64Reg::Q0);
|
||||
}
|
||||
|
||||
|
|
|
@ -514,7 +514,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst)
|
|||
const u32 b = inst.FB;
|
||||
const u32 d = inst.FD;
|
||||
|
||||
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
||||
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
|
||||
fpr.Lock(ARM64Reg::Q0);
|
||||
|
||||
const ARM64Reg VB = fpr.R(b, RegType::Register);
|
||||
|
@ -529,7 +529,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst)
|
|||
BL(GetAsmRoutines()->frsqrte);
|
||||
m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0);
|
||||
|
||||
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
||||
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30);
|
||||
fpr.Unlock(ARM64Reg::Q0);
|
||||
|
||||
fpr.FixSinglePrecision(d);
|
||||
|
|
|
@ -311,7 +311,7 @@ void JitArm64::GenerateFres()
|
|||
}
|
||||
|
||||
// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE.
|
||||
// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags.
|
||||
// Output in X0 and memory (PPCState). Clobbers X0-X3 and flags.
|
||||
void JitArm64::GenerateFrsqrte()
|
||||
{
|
||||
// The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and
|
||||
|
@ -332,16 +332,16 @@ void JitArm64::GenerateFrsqrte()
|
|||
|
||||
// "Normalize" denormal values
|
||||
CLZ(ARM64Reg::X3, ARM64Reg::X3);
|
||||
SUB(ARM64Reg::X4, ARM64Reg::X3, 11);
|
||||
MOVI2R(ARM64Reg::X2, 12);
|
||||
LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X4);
|
||||
LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);
|
||||
LSR(ARM64Reg::X1, ARM64Reg::X1, 11);
|
||||
SUB(ARM64Reg::X3, ARM64Reg::X2, ARM64Reg::X3);
|
||||
BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12);
|
||||
|
||||
SetJumpTarget(normal);
|
||||
UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5);
|
||||
MOVP2R(ARM64Reg::X4, &Common::frsqrte_expected);
|
||||
ADD(ARM64Reg::X2, ARM64Reg::X4, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
|
||||
MOVP2R(ARM64Reg::X3, &Common::frsqrte_expected);
|
||||
ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
|
||||
LDP(IndexType::Signed, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::X2, 0);
|
||||
UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 11);
|
||||
AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64));
|
||||
|
@ -350,10 +350,10 @@ void JitArm64::GenerateFrsqrte()
|
|||
RET();
|
||||
|
||||
SetJumpTarget(zero);
|
||||
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26);
|
||||
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2);
|
||||
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
FixupBranch skip_set_zx = TBNZ(ARM64Reg::W3, 26);
|
||||
ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2);
|
||||
STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
SetJumpTarget(skip_set_zx);
|
||||
RET();
|
||||
|
||||
|
@ -363,10 +363,10 @@ void JitArm64::GenerateFrsqrte()
|
|||
FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ);
|
||||
|
||||
SetJumpTarget(negative);
|
||||
LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9);
|
||||
ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2);
|
||||
STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W3, 9);
|
||||
ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2);
|
||||
STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr));
|
||||
SetJumpTarget(skip_set_vxsqrt);
|
||||
SetJumpTarget(nan_or_positive_inf);
|
||||
RET();
|
||||
|
|
Loading…
Reference in New Issue