[AArch64] Clean up bogus vector FCVT{N,L} instruction usage.
Replace the instruction with the scalar variant FCVT instruction. FCVT{N,L} 8 cycles latency on the Cortex A57 FCVT has five cycle latency and slightly higher throughput On the A72 all three of these instructions will have three cycle latency, While FCVT{N,L} will have half the throughput.
This commit is contained in:
parent
2c68f6bfc5
commit
791c7d5a84
|
@ -405,7 +405,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
|
|||
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
|
||||
{
|
||||
LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
|
||||
FixupBranch no_fail = B(CC_EQ);
|
||||
FixupBranch no_fail = CBZ(W0);
|
||||
FixupBranch fail = B();
|
||||
SwitchToFarCode();
|
||||
SetJumpTarget(fail);
|
||||
|
|
|
@ -83,7 +83,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
{
|
||||
m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr);
|
||||
m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
|
||||
m_float_emit.FCVTL(64, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
|
||||
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -211,7 +211,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
|
|||
MOVI2R(X30, (u64)&PowerPC::Read_U32);
|
||||
BLR(X30);
|
||||
m_float_emit.INS(32, RS, 0, X0);
|
||||
m_float_emit.FCVTL(64, RS, RS);
|
||||
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -335,8 +335,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
|
|||
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
|
||||
ARM64Reg VD = fpr.RW(d, REG_DUP);
|
||||
|
||||
m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
|
||||
m_float_emit.FCVTL(64, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
|
||||
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
|
||||
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
|
||||
}
|
||||
|
||||
void JitArm64::fcmpx(UGeckoInstruction inst)
|
||||
|
@ -441,7 +441,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
|
|||
m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
|
||||
m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
|
||||
|
||||
m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
|
||||
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
|
||||
m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
|
||||
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
|
||||
fpr.Unlock(V0);
|
||||
|
|
|
@ -66,14 +66,14 @@ void JitArm64::psq_l(UGeckoInstruction inst)
|
|||
ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
|
||||
m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
|
||||
m_float_emit.REV32(8, VS, VS);
|
||||
m_float_emit.FCVTL(64, VS, VS);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
|
||||
m_float_emit.REV32(8, VS, VS);
|
||||
|
||||
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
|
||||
}
|
||||
m_float_emit.FCVTL(64, VS, VS);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -86,7 +86,10 @@ void JitArm64::psq_l(UGeckoInstruction inst)
|
|||
BLR(X30);
|
||||
|
||||
VS = fpr.RW(inst.RS, REG_REG);
|
||||
m_float_emit.FCVTL(64, VS, D0);
|
||||
if (!inst.W)
|
||||
m_float_emit.FCVTL(64, VS, D0);
|
||||
else
|
||||
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0);
|
||||
}
|
||||
|
||||
if (inst.W)
|
||||
|
|
Loading…
Reference in New Issue