diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 2496e74dc8..7fabcd83fb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -686,7 +686,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst) const ARM64Reg VB = fpr.R(b, RegType::LowerPair); const ARM64Reg VD = fpr.RW(d, RegType::LowerPair); - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB)); @@ -697,7 +697,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst) SetFPRFIfNeeded(false, ARM64Reg::X0); - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index a79bf83cb0..d4202875ee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -514,7 +514,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst) const u32 b = inst.FB; const u32 d = inst.FD; - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0); const ARM64Reg VB = fpr.R(b, RegType::Register); @@ -529,7 +529,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst) BL(GetAsmRoutines()->frsqrte); m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0); - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); fpr.FixSinglePrecision(d); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 9fc224dc08..5c5033d1e3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -307,7 +307,7 @@ void JitArm64::GenerateFres() } // Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE. -// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags. +// Output in X0 and memory (PPCState). Clobbers X0-X3 and flags. void JitArm64::GenerateFrsqrte() { // The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and @@ -323,35 +323,32 @@ void JitArm64::GenerateFrsqrte() CMP(ARM64Reg::X2, ARM64Reg::X3); FixupBranch nan_or_inf = B(CCFlags::CC_EQ); FixupBranch negative = TBNZ(ARM64Reg::X1, 63); - AND(ARM64Reg::X3, ARM64Reg::X1, LogicalImm(Common::DOUBLE_FRAC, 64)); FixupBranch normal = CBNZ(ARM64Reg::X2); - // "Normalize" denormal values - CLZ(ARM64Reg::X3, ARM64Reg::X3); - SUB(ARM64Reg::X4, ARM64Reg::X3, 11); - MOVI2R(ARM64Reg::X2, 0x00C0'0000'0000'0000); - LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4); - SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); - AND(ARM64Reg::X3, ARM64Reg::X4, LogicalImm(Common::DOUBLE_FRAC - 1, 64)); + // "Normalize" denormal values. + // The simplified calculation used here results in the upper 11 bits being incorrect, + // but that's fine, because the code below never reads those bits. + CLZ(ARM64Reg::X3, ARM64Reg::X1); + LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3); + LSR(ARM64Reg::X1, ARM64Reg::X1, 11); + BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12); SetJumpTarget(normal); - LSR(ARM64Reg::X2, ARM64Reg::X2, 48); - AND(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64)); - MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected); - ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSR, 48)); - ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); - LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0); - UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11); + UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5); + MOVP2R(ARM64Reg::X3, &Common::frsqrte_expected); + ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); + LDP(IndexType::Signed, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::X2, 0); + UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 11); AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64)); - MADD(ARM64Reg::W3, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::W1); - ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 26)); + MADD(ARM64Reg::W1, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); + ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1, ArithOption(ARM64Reg::X1, ShiftType::LSL, 26)); RET(); SetJumpTarget(zero); - LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); - FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26); - ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); - STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_zx = TBNZ(ARM64Reg::W3, 26); + ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); SetJumpTarget(skip_set_zx); RET(); @@ -361,10 +358,10 @@ void JitArm64::GenerateFrsqrte() FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ); SetJumpTarget(negative); - LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); - FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9); - ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); - STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W3, 9); + ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); SetJumpTarget(skip_set_vxsqrt); SetJumpTarget(nan_or_positive_inf); RET();