[AArch64] Clean up bogus vector FCVT{N,L} instruction usage.

Replace these instructions with the scalar variant, the FCVT instruction.
FCVT{N,L} have an eight-cycle latency on the Cortex-A57.
FCVT has a five-cycle latency and slightly higher throughput.

On the A72, all three of these instructions will have a three-cycle latency,
while FCVT{N,L} will have half the throughput of FCVT.
This commit is contained in:
Ryan Houdek 2015-09-01 17:10:53 -05:00
parent 2c68f6bfc5
commit 791c7d5a84
4 changed files with 12 additions and 9 deletions

View File

@ -405,7 +405,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
{
LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
FixupBranch no_fail = B(CC_EQ);
FixupBranch no_fail = CBZ(W0);
FixupBranch fail = B();
SwitchToFarCode();
SetJumpTarget(fail);

View File

@ -83,7 +83,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
{
m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr);
m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
m_float_emit.FCVTL(64, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
}
else
{
@ -211,7 +211,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
MOVI2R(X30, (u64)&PowerPC::Read_U32);
BLR(X30);
m_float_emit.INS(32, RS, 0, X0);
m_float_emit.FCVTL(64, RS, RS);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
}
else
{

View File

@ -335,8 +335,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
ARM64Reg VD = fpr.RW(d, REG_DUP);
m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVTL(64, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
}
void JitArm64::fcmpx(UGeckoInstruction inst)
@ -441,7 +441,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
fpr.Unlock(V0);

View File

@ -66,14 +66,14 @@ void JitArm64::psq_l(UGeckoInstruction inst)
ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
m_float_emit.REV32(8, VS, VS);
m_float_emit.FCVTL(64, VS, VS);
}
else
{
m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
m_float_emit.REV32(8, VS, VS);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
}
m_float_emit.FCVTL(64, VS, VS);
}
else
{
@ -86,7 +86,10 @@ void JitArm64::psq_l(UGeckoInstruction inst)
BLR(X30);
VS = fpr.RW(inst.RS, REG_REG);
m_float_emit.FCVTL(64, VS, D0);
if (!inst.W)
m_float_emit.FCVTL(64, VS, D0);
else
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0);
}
if (inst.W)