JitArm64: Reimplement Force25BitPrecision

The previous implementation of Force25BitPrecision was essentially a
translation of the x86-64 implementation. It worked, but we can make a
more efficient implementation by using an AArch64 instruction I don't
believe x86-64 has an equivalent of: URSHR. The latency is the same as
before, but the instruction count and register count are both reduced.
This commit is contained in:
JosJuice 2022-10-19 19:35:10 +02:00
parent bba38a3642
commit 4dbf0b8e90
5 changed files with 57 additions and 32 deletions

View File

@ -2354,7 +2354,7 @@ void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64R
void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{ {
Write32((2 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) | Write32((1 << 30) | (U << 29) | (0x3E << 23) | (imm << 16) | (opcode << 11) | (1 << 10) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd)); (DecodeReg(Rn) << 5) | DecodeReg(Rd));
} }
@ -3540,7 +3540,26 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
EmitPermute(size, 0b111, Rd, Rn, Rm); EmitPermute(size, 0b111, Rd, Rn, Rm);
} }
// Shift by immediate // Scalar shift by immediate
void ARM64FloatEmitter::SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
constexpr size_t src_size = 64;
ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!");
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must less than the element size! {} {}",
shift, src_size);
EmitScalarShiftImm(0, src_size | shift, 0b01010, Rd, Rn);
}
void ARM64FloatEmitter::URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
constexpr size_t src_size = 64;
ASSERT_MSG(DYNA_REC, IsDouble(Rd), "Only double registers are supported!");
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must less than the element size! {} {}",
shift, src_size);
EmitScalarShiftImm(1, src_size * 2 - shift, 0b00100, Rd, Rn);
}
// Vector shift by immediate
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{ {
SSHLL(src_size, Rd, Rn, shift, false); SSHLL(src_size, Rd, Rn, shift, false);
@ -3582,6 +3601,13 @@ void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
UXTL(src_size, Rd, Rn, true); UXTL(src_size, Rd, Rn, true);
} }
void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn);
}
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
{ {
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
@ -3589,6 +3615,13 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn); EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn);
} }
void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn);
}
void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
{ {
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",

View File

@ -1229,9 +1229,15 @@ public:
void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Shift by immediate // Scalar shift by immediate
void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
// Vector shift by immediate
void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);

View File

@ -333,8 +333,7 @@ protected:
bool Rc = false); bool Rc = false);
void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg); void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input, void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input);
Arm64Gen::ARM64Reg temp);
// <Fastmem fault location, slowmem handler location> // <Fastmem fault location, slowmem handler location>
std::map<const u8*, FastmemArea> m_fault_to_handler; std::map<const u8*, FastmemArea> m_fault_to_handler;

View File

@ -45,24 +45,18 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before // Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa, // a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
// rounding to nearest as it does. // rounding to nearest as it does.
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp) void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input)
{ {
ASSERT(output != input && output != temp && input != temp);
// temp = 0x0000'0000'0800'0000ULL
// output = 0xFFFF'FFFF'F800'0000ULL
m_float_emit.MOVI(32, temp, 0x08, 24);
m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
m_float_emit.BIC(temp, temp, output);
m_float_emit.ORR(32, output, 0xF8, 24);
// output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1)
m_float_emit.AND(temp, input, temp);
m_float_emit.AND(output, input, output);
if (IsQuad(input)) if (IsQuad(input))
m_float_emit.ADD(64, output, output, temp); {
m_float_emit.URSHR(64, output, input, 28);
m_float_emit.SHL(64, output, output, 28);
}
else else
m_float_emit.ADD(output, output, temp); {
m_float_emit.URSHR(output, input, 28);
m_float_emit.SHL(output, output, 28);
}
} }
void JitArm64::fp_arith(UGeckoInstruction inst) void JitArm64::fp_arith(UGeckoInstruction inst)
@ -113,9 +107,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg(); V0Q = fpr.GetReg();
V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q)); Force25BitPrecision(reg_encoder(V0Q), VC);
VC = reg_encoder(V0Q); VC = reg_encoder(V0Q);
} }
@ -160,17 +153,15 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
{ {
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg();
V1Q = fpr.GetReg(); V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q)); Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q); VC = reg_encoder(V1Q);
} }
ARM64Reg inaccurate_fma_temp_reg = VD; ARM64Reg inaccurate_fma_temp_reg = VD;
if (inaccurate_fma && d == b) if (inaccurate_fma && d == b)
{ {
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg(); V0Q = fpr.GetReg();
inaccurate_fma_temp_reg = reg_encoder(V0Q); inaccurate_fma_temp_reg = reg_encoder(V0Q);

View File

@ -103,12 +103,9 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg(); V0Q = fpr.GetReg();
const ARM64Reg V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q)); Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC));
VC = reg_encoder(V0Q); VC = reg_encoder(V0Q);
fpr.Unlock(V1Q);
} }
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0); m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
@ -165,10 +162,9 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
{ {
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
allocate_v0_if_needed();
V1Q = fpr.GetReg(); V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V1Q), VC, V0); Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q); VC = reg_encoder(V1Q);
} }