[AArch64] Optimize lfd instructions if possible.

If we are going to be using lfd, then chances are it is going to be used in double-heavy areas of code.
If we only need to load the lower register, then we also should not have to worry about inserting into the low 64 bits of the guest register.
So add a new flag to the backpatching so that lfd loads directly into the destination register.
This gives ~3% performance improvement to Povray.
This commit is contained in:
Ryan Houdek 2015-08-21 04:01:58 -05:00
parent 6cb87a9227
commit df53b37253
3 changed files with 39 additions and 10 deletions

View File

@ -71,9 +71,22 @@ bool JitArm64::DisasmLoadStore(const u8* ptr, u32* flags, ARM64Reg* reg)
}
else // 64-bit float
{
// Real register is in the INS instruction
u32 ins_inst = *(u32*)(ptr + 8);
*reg = (ARM64Reg)(ins_inst & 0x1F);
u32 ldr_reg = inst & 0x1F;
if (ldr_reg)
{
// Loads directly in to the target register
// No need to dump the flag in to flags here
// The slowmem path always first returns in Q0
// then moves to the destination register
*reg = (ARM64Reg)(ldr_reg);
}
else
{
// Real register is in the INS instruction
u32 ins_inst = *(u32*)(ptr + 8);
*reg = (ARM64Reg)(ins_inst & 0x1F);
}
}
*flags |= BackPatchInfo::FLAG_LOAD;
return true;
@ -165,9 +178,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
}
else
{
m_float_emit.LDR(64, INDEX_UNSIGNED, Q0, addr, 0);
m_float_emit.REV64(8, D0, D0);
m_float_emit.INS(64, RS, 0, Q0, 0);
if (flags & BackPatchInfo::FLAG_ONLY_LOWER)
{
m_float_emit.LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(RS), addr, 0);
m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
}
else
{
m_float_emit.LDR(64, INDEX_UNSIGNED, Q0, addr, 0);
m_float_emit.REV64(8, D0, D0);
m_float_emit.INS(64, RS, 0, Q0, 0);
}
}
}
else if (flags & BackPatchInfo::FLAG_STORE)
@ -217,7 +238,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
handler.addr_reg = addr;
handler.gprs = gprs_to_push;
handler.fprs = fprs_to_push;
handler.flags = flags;
handler.flags = flags & ~BackPatchInfo::FLAG_ONLY_LOWER;
FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_start];
auto handler_loc_iter = m_handler_to_loc.find(handler);

View File

@ -71,12 +71,19 @@ void JitArm64::lfXX(UGeckoInstruction inst)
u32 imm_addr = 0;
bool is_immediate = false;
// 64 bit loads only load PSR0
fpr.BindToRegister(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64, flags & BackPatchInfo::FLAG_SIZE_F64);
bool only_lower = !!(flags & BackPatchInfo::FLAG_SIZE_F64);
ARM64Reg VD = fpr.R(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64);
fpr.BindToRegister(inst.FD, false, only_lower);
ARM64Reg VD = fpr.R(inst.FD, only_lower);
ARM64Reg addr_reg = W0;
if (!fpr.IsLower(inst.FD))
only_lower = false;
if (only_lower)
flags |= BackPatchInfo::FLAG_ONLY_LOWER;
gpr.Lock(W0, W30);
fpr.Lock(Q0);

View File

@ -18,6 +18,7 @@ struct BackPatchInfo
FLAG_SIZE_F64 = (1 << 6),
FLAG_REVERSE = (1 << 7),
FLAG_EXTEND = (1 << 8),
FLAG_ONLY_LOWER = (1 << 9),
};
static u32 GetFlagSize(u32 flags)