From df53b37253e45e195a729882398718f01d2819b9 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Fri, 21 Aug 2015 04:01:58 -0500
Subject: [PATCH] [AArch64] Optimize lfd instructions if possible.

If we are going to be using lfd, then chances are it is going to be used
in double-heavy areas of code. If we only need to load the lower
register, we should not have to pay for inserting the result into the
low 64 bits of the guest register.

So add a new flag to the backpatching so that lfd can load directly
into the destination register.

This gives a ~3% performance improvement in Povray.
---
 .../PowerPC/JitArm64/JitArm64_BackPatch.cpp   | 35 +++++++++++++++----
 .../JitArm64/JitArm64_LoadStoreFloating.cpp   | 13 +++++--
 .../Core/PowerPC/JitArmCommon/BackPatch.h     |  1 +
 3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index a71944479c..c57c13aee6 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -71,9 +71,22 @@ bool JitArm64::DisasmLoadStore(const u8* ptr, u32* flags, ARM64Reg* reg)
 	}
 	else // 64-bit float
 	{
-		// Real register is in the INS instruction
-		u32 ins_inst = *(u32*)(ptr + 8);
-		*reg = (ARM64Reg)(ins_inst & 0x1F);
+		u32 ldr_reg = inst & 0x1F;
+
+		if (ldr_reg)
+		{
+			// The load goes directly into the target register, so there
+			// is no need to add FLAG_ONLY_LOWER to flags here: the
+			// slowmem path always returns in Q0 first and then moves
+			// the result to the destination register.
+			*reg = (ARM64Reg)(ldr_reg);
+		}
+		else
+		{
+			// Real register is in the INS instruction
+			u32 ins_inst = *(u32*)(ptr + 8);
+			*reg = (ARM64Reg)(ins_inst & 0x1F);
+		}
 	}
 	*flags |= BackPatchInfo::FLAG_LOAD;
 	return true;
@@ -165,9 +178,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		}
 		else
 		{
-			m_float_emit.LDR(64, INDEX_UNSIGNED, Q0, addr, 0);
-			m_float_emit.REV64(8, D0, D0);
-			m_float_emit.INS(64, RS, 0, Q0, 0);
+			if (flags & BackPatchInfo::FLAG_ONLY_LOWER)
+			{
+				m_float_emit.LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(RS), addr, 0);
+				m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
+			}
+			else
+			{
+				m_float_emit.LDR(64, INDEX_UNSIGNED, Q0, addr, 0);
+				m_float_emit.REV64(8, D0, D0);
+				m_float_emit.INS(64, RS, 0, Q0, 0);
+			}
 		}
 	}
 	else if (flags & BackPatchInfo::FLAG_STORE)
@@ -217,7 +238,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		handler.addr_reg = addr;
 		handler.gprs = gprs_to_push;
 		handler.fprs = fprs_to_push;
-		handler.flags = flags;
+		handler.flags = flags & ~BackPatchInfo::FLAG_ONLY_LOWER;
 
 		FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_start];
 		auto handler_loc_iter = m_handler_to_loc.find(handler);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
index eb47f91546..ff9b1af996 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@@ -71,12 +71,19 @@ void JitArm64::lfXX(UGeckoInstruction inst)
 	u32 imm_addr = 0;
 	bool is_immediate = false;
 
-	// 64 bit loads only load PSR0
-	fpr.BindToRegister(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64, flags & BackPatchInfo::FLAG_SIZE_F64);
+	bool only_lower = !!(flags & BackPatchInfo::FLAG_SIZE_F64);
 
-	ARM64Reg VD = fpr.R(inst.FD, flags & BackPatchInfo::FLAG_SIZE_F64);
+	fpr.BindToRegister(inst.FD, false, only_lower);
+
+	ARM64Reg VD = fpr.R(inst.FD, only_lower);
 	ARM64Reg addr_reg = W0;
 
+	if (!fpr.IsLower(inst.FD))
+		only_lower = false;
+
+	if (only_lower)
+		flags |= BackPatchInfo::FLAG_ONLY_LOWER;
+
 	gpr.Lock(W0, W30);
 	fpr.Lock(Q0);
 
diff --git a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
index fc8f96ad26..7714cadef9 100644
--- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
+++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
@@ -18,6 +18,7 @@ struct BackPatchInfo
 		FLAG_SIZE_F64 = (1 << 6),
 		FLAG_REVERSE = (1 << 7),
 		FLAG_EXTEND = (1 << 8),
+		FLAG_ONLY_LOWER = (1 << 9),
 	};
 
 	static u32 GetFlagSize(u32 flags)
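
As an illustration of why the FLAG_ONLY_LOWER path can skip the INS: the
standalone C++ sketch below (not Dolphin code; GuestFpr, LoadWithInsert and
LoadLowerOnly are names invented for this sketch) models a guest FPR as two
64-bit lanes. The existing sequence loads into a scratch register, byte-swaps,
and inserts into lane 0 so the upper lane survives; the FLAG_ONLY_LOWER
sequence loads and swaps directly in the destination because only the lower
double is live there.

#include <cstdint>
#include <cstdio>

// Guest FPR modelled as two 64-bit lanes, like a paired-single register.
struct GuestFpr
{
	uint64_t ps0;  // lane 0: the double that lfd writes
	uint64_t ps1;  // lane 1: must survive when the full register is live
};

// Byte-swap a big-endian guest double into host order (what the emitted
// REV64 of 8-bit elements achieves in the patch).
static uint64_t ByteSwap64(uint64_t v)
{
	uint64_t r = 0;
	for (int i = 0; i < 8; i++)
		r = (r << 8) | ((v >> (i * 8)) & 0xFF);
	return r;
}

// Existing path: load into a scratch (Q0), byte-swap it, then insert into
// lane 0 only, so whatever is in the upper lane of the destination stays put.
static void LoadWithInsert(GuestFpr& dest, uint64_t guest_mem_raw)
{
	uint64_t scratch = ByteSwap64(guest_mem_raw);  // LDR Q0 + REV64
	dest.ps0 = scratch;                            // INS element 0; ps1 untouched
}

// FLAG_ONLY_LOWER path: only the lower double is live in the register cache,
// so the load and swap can target the destination directly and the scratch
// register and INS disappear.
static void LoadLowerOnly(GuestFpr& dest, uint64_t guest_mem_raw)
{
	dest.ps0 = ByteSwap64(guest_mem_raw);  // LDR Dn + REV64, no INS
	dest.ps1 = 0;                          // upper lane is not meaningful here
}

int main()
{
	// IEEE bits of pi as a big-endian double read back as a little-endian u64.
	const uint64_t guest_mem_raw = 0x182D4454FB210940ull;

	GuestFpr full = {0, 0x3FF0000000000000ull};  // pretend lane 1 holds 1.0
	LoadWithInsert(full, guest_mem_raw);

	GuestFpr lower = {0, 0};
	LoadLowerOnly(lower, guest_mem_raw);

	std::printf("insert path: ps0=%016llx ps1=%016llx\n",
	            (unsigned long long)full.ps0, (unsigned long long)full.ps1);
	std::printf("lower-only : ps0=%016llx ps1=%016llx\n",
	            (unsigned long long)lower.ps0, (unsigned long long)lower.ps1);
	return 0;
}

The same reasoning is behind registering the slowmem handler with
flags & ~FLAG_ONLY_LOWER above: the slowmem path always returns the value in
Q0 and then moves it to the destination register, so both fastmem variants
can share a single handler.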