From 31d751b6daafd3a73f9c9194d9cbc5cf1f7342af Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 18:48:47 +0200 Subject: [PATCH 1/6] JitArm64: Read X1 in frsqrte normal path Instead of combining X2 (the exponent) and X3 (the mantissa) using an ORR instruction, we can read X1, which already contains both. This requires us to reconstruct X1 in the denormal path, but that's an acceptable price. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 77d64de300..f424281afd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -337,13 +337,12 @@ void JitArm64::GenerateFrsqrte() LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4); SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); AND(ARM64Reg::X3, ARM64Reg::X4, LogicalImm(Common::DOUBLE_FRAC - 1, 64)); + ORR(ARM64Reg::X1, ARM64Reg::X2, ARM64Reg::X3); SetJumpTarget(normal); - LSR(ARM64Reg::X2, ARM64Reg::X2, 48); - AND(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64)); - MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected); - ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSR, 48)); - ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); + UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5); + MOVP2R(ARM64Reg::X4, &Common::frsqrte_expected); + ADD(ARM64Reg::X2, ARM64Reg::X4, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0); UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11); AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64)); From e5bd8019f6184661a7a6b7e534894bd5dfeda5eb Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 19:23:51 +0200 Subject: [PATCH 2/6] JitArm64: Read X1 instead of X3 in frsqrte normal path With this, the normal path is no longer reading the value in X3, which opens up for the next commit. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index f424281afd..9771d9118a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -343,11 +343,11 @@ void JitArm64::GenerateFrsqrte() UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5); MOVP2R(ARM64Reg::X4, &Common::frsqrte_expected); ADD(ARM64Reg::X2, ARM64Reg::X4, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); - LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0); - UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11); + LDP(IndexType::Signed, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::X2, 0); + UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 11); AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64)); - MADD(ARM64Reg::W3, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::W1); - ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 26)); + MADD(ARM64Reg::W1, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); + ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1, ArithOption(ARM64Reg::X1, ShiftType::LSL, 26)); RET(); SetJumpTarget(zero); From 4b2f73774f10ef72e037f7a25c0ce92e2b9bdb6d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 19:38:46 +0200 Subject: [PATCH 3/6] JitArm64: Optimize frsqrte denormal path Now that the normal path is no longer reading X2 and X3, it doesn't matter what the denormal path writes to them. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 9771d9118a..fd1693f62a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -333,11 +333,10 @@ void JitArm64::GenerateFrsqrte() // "Normalize" denormal values CLZ(ARM64Reg::X3, ARM64Reg::X3); SUB(ARM64Reg::X4, ARM64Reg::X3, 11); - MOVI2R(ARM64Reg::X2, 0x00C0'0000'0000'0000); - LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4); - SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); - AND(ARM64Reg::X3, ARM64Reg::X4, LogicalImm(Common::DOUBLE_FRAC - 1, 64)); - ORR(ARM64Reg::X1, ARM64Reg::X2, ARM64Reg::X3); + MOVI2R(ARM64Reg::X2, 12); + LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X4); + SUB(ARM64Reg::X3, ARM64Reg::X2, ARM64Reg::X3); + BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12); SetJumpTarget(normal); UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5); From 9807cf0b8293c796710d238c4425ab15c374d2c5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 20:17:08 +0200 Subject: [PATCH 4/6] JitArm64: Stop using X4 in frsqrte This required a change in the denormal path where, instead of subtracting 11 before shifting left, we shift left immediately and then shift right by 11. This shouldn't affect performance. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 4 +-- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 4 +-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 26 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 54ba054bce..48cdc50849 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -685,7 +685,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst) const ARM64Reg VB = fpr.R(b, RegType::LowerPair); const ARM64Reg VD = fpr.RW(d, RegType::LowerPair); - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB)); @@ -696,7 +696,7 @@ void JitArm64::frsqrtex(UGeckoInstruction inst) SetFPRFIfNeeded(false, ARM64Reg::X0); - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index a79bf83cb0..d4202875ee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -514,7 +514,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst) const u32 b = inst.FB; const u32 d = inst.FD; - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0); const ARM64Reg VB = fpr.R(b, RegType::Register); @@ -529,7 +529,7 @@ void JitArm64::ps_rsqrte(UGeckoInstruction inst) BL(GetAsmRoutines()->frsqrte); m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0); - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); fpr.FixSinglePrecision(d); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index fd1693f62a..a88ab47b18 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -311,7 +311,7 @@ void JitArm64::GenerateFres() } // Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE. -// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags. +// Output in X0 and memory (PPCState). Clobbers X0-X3 and flags. void JitArm64::GenerateFrsqrte() { // The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and @@ -332,16 +332,16 @@ void JitArm64::GenerateFrsqrte() // "Normalize" denormal values CLZ(ARM64Reg::X3, ARM64Reg::X3); - SUB(ARM64Reg::X4, ARM64Reg::X3, 11); MOVI2R(ARM64Reg::X2, 12); - LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X4); + LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3); + LSR(ARM64Reg::X1, ARM64Reg::X1, 11); SUB(ARM64Reg::X3, ARM64Reg::X2, ARM64Reg::X3); BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12); SetJumpTarget(normal); UBFX(ARM64Reg::X2, ARM64Reg::X1, 48, 5); - MOVP2R(ARM64Reg::X4, &Common::frsqrte_expected); - ADD(ARM64Reg::X2, ARM64Reg::X4, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X3, &Common::frsqrte_expected); + ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); LDP(IndexType::Signed, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::X2, 0); UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 11); AND(ARM64Reg::X0, ARM64Reg::X0, LogicalImm(Common::DOUBLE_SIGN | Common::DOUBLE_EXP, 64)); @@ -350,10 +350,10 @@ void JitArm64::GenerateFrsqrte() RET(); SetJumpTarget(zero); - LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); - FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26); - ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); - STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_zx = TBNZ(ARM64Reg::W3, 26); + ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); SetJumpTarget(skip_set_zx); RET(); @@ -363,10 +363,10 @@ void JitArm64::GenerateFrsqrte() FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ); SetJumpTarget(negative); - LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); - FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9); - ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); - STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + LDR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W3, 9); + ORRI2R(ARM64Reg::W3, ARM64Reg::W3, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W3, PPC_REG, PPCSTATE_OFF(fpscr)); SetJumpTarget(skip_set_vxsqrt); SetJumpTarget(nan_or_positive_inf); RET(); From 9b21046dfcb9d1d089a09e5fd949829b56bc2472 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 20:54:17 +0200 Subject: [PATCH 5/6] JitArm64: Read X1 instead of X3 in frsqrte denormal path If we hit the denormal path, the sign and exponent are guaranteed to be 0, which means X1 and X3 have the same value. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index a88ab47b18..d57743d76a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -327,11 +327,10 @@ void JitArm64::GenerateFrsqrte() CMP(ARM64Reg::X2, ARM64Reg::X3); FixupBranch nan_or_inf = B(CCFlags::CC_EQ); FixupBranch negative = TBNZ(ARM64Reg::X1, 63); - AND(ARM64Reg::X3, ARM64Reg::X1, LogicalImm(Common::DOUBLE_FRAC, 64)); FixupBranch normal = CBNZ(ARM64Reg::X2); // "Normalize" denormal values - CLZ(ARM64Reg::X3, ARM64Reg::X3); + CLZ(ARM64Reg::X3, ARM64Reg::X1); MOVI2R(ARM64Reg::X2, 12); LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3); LSR(ARM64Reg::X1, ARM64Reg::X1, 11); From 9cc1df6c14a757ad9a78e66c0c4e30e89b867f7f Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 11 Oct 2023 21:48:41 +0200 Subject: [PATCH 6/6] JitArm64: Optimize frsqrte denormal path (in an "interesting" way) --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index d57743d76a..1b5b1e1207 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -329,12 +329,12 @@ void JitArm64::GenerateFrsqrte() FixupBranch negative = TBNZ(ARM64Reg::X1, 63); FixupBranch normal = CBNZ(ARM64Reg::X2); - // "Normalize" denormal values + // "Normalize" denormal values. + // The simplified calculation used here results in the upper 11 bits being incorrect, + // but that's fine, because the code below never reads those bits. CLZ(ARM64Reg::X3, ARM64Reg::X1); - MOVI2R(ARM64Reg::X2, 12); LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3); LSR(ARM64Reg::X1, ARM64Reg::X1, 11); - SUB(ARM64Reg::X3, ARM64Reg::X2, ARM64Reg::X3); BFI(ARM64Reg::X1, ARM64Reg::X3, 52, 12); SetJumpTarget(normal);