From 4dbf0b8e904ad3a338809ff1df4ba87836fffebe Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 19 Oct 2022 19:35:10 +0200 Subject: [PATCH] JitArm64: Reimplement Force25BitPrecision The previous implementation of Force25BitPrecision was essentially a translation of the x86-64 implementation. It worked, but we can make a more efficient implementation by using an AArch64 instruction I don't believe x86-64 has an equivalent of: URSHR. The latency is the same as before, but the instruction count and register count are both reduced. --- Source/Core/Common/Arm64Emitter.cpp | 37 ++++++++++++++++++- Source/Core/Common/Arm64Emitter.h | 8 +++- Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +- .../JitArm64/JitArm64_FloatingPoint.cpp | 33 ++++++----------- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 8 +--- 5 files changed, 57 insertions(+), 32 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 2d6531503b..30dbaacd46 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2354,7 +2354,7 @@ void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64R void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - Write32((2 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) | + Write32((1 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } @@ -3540,7 +3540,26 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) EmitPermute(size, 0b111, Rd, Rn, Rm); } -// Shift by immediate +// Scalar shift by immediate +void ARM64FloatEmitter::SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + constexpr size_t src_size = 64; + ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!"); + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", + shift, src_size); + EmitScalarShiftImm(0, src_size | shift, 0b01010, Rd, Rn); +} + +void ARM64FloatEmitter::URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + constexpr size_t src_size = 64; + ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!"); + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitScalarShiftImm(1, src_size * 2 - shift, 0b00100, Rd, Rn); +} + +// Vector shift by immediate void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { SSHLL(src_size, Rd, Rn, shift, false); @@ -3582,6 +3601,13 @@ void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) UXTL(src_size, Rd, Rn, true); } +void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn); +} + void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", @@ -3589,6 +3615,13 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn); } +void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn); +} + void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a180b5282c..caf997a29d 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1229,9 +1229,15 @@ public: void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); - // Shift by immediate + // Scalar shift by immediate + void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift); + + // Vector shift by immediate + void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 75565b771d..2d0f39b66b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -333,8 +333,7 @@ protected: bool Rc = false); void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg); - void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input, - Arm64Gen::ARM64Reg temp); + void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input); // std::map m_fault_to_handler; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 40bad6ee03..260da2d900 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -45,24 +45,18 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg) // Emulate the odd truncation/rounding that the PowerPC does on the RHS operand 
before // a single precision multiply. To be precise, it drops the low 28 bits of the mantissa, // rounding to nearest as it does. -void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp) +void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input) { - ASSERT(output != input && output != temp && input != temp); - - // temp = 0x0000'0000'0800'0000ULL - // output = 0xFFFF'FFFF'F800'0000ULL - m_float_emit.MOVI(32, temp, 0x08, 24); - m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL); - m_float_emit.BIC(temp, temp, output); - m_float_emit.ORR(32, output, 0xF8, 24); - - // output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1) - m_float_emit.AND(temp, input, temp); - m_float_emit.AND(output, input, output); if (IsQuad(input)) - m_float_emit.ADD(64, output, output, temp); + { + m_float_emit.URSHR(64, output, input, 28); + m_float_emit.SHL(64, output, output, 28); + } else - m_float_emit.ADD(output, output, temp); + { + m_float_emit.URSHR(output, input, 28); + m_float_emit.SHL(output, output, 28); + } } void JitArm64::fp_arith(UGeckoInstruction inst) @@ -113,9 +107,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); V0Q = fpr.GetReg(); - V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q)); + Force25BitPrecision(reg_encoder(V0Q), VC); VC = reg_encoder(V0Q); } @@ -160,18 +153,16 @@ void JitArm64::fp_arith(UGeckoInstruction inst) { ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - V0Q = fpr.GetReg(); V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q)); + Force25BitPrecision(reg_encoder(V1Q), VC); VC = reg_encoder(V1Q); } ARM64Reg inaccurate_fma_temp_reg = VD; if (inaccurate_fma && d == b) { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetReg(); + V0Q = fpr.GetReg(); inaccurate_fma_temp_reg = reg_encoder(V0Q); } diff --git 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 7682679044..0d786f80a0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -103,12 +103,9 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); V0Q = fpr.GetReg(); - const ARM64Reg V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q)); + Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC)); VC = reg_encoder(V0Q); - - fpr.Unlock(V1Q); } m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0); @@ -165,10 +162,9 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) { ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); - allocate_v0_if_needed(); V1Q = fpr.GetReg(); - Force25BitPrecision(reg_encoder(V1Q), VC, V0); + Force25BitPrecision(reg_encoder(V1Q), VC); VC = reg_encoder(V1Q); }