From 6aa54a029edd1b8d63db012fa119e75ebf5c3aa6 Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 9 Feb 2017 09:25:31 +0100 Subject: [PATCH 1/2] JitArm64: Optimize GPR register push/pop. --- Source/Core/Common/Arm64Emitter.cpp | 120 +++++++++------------------- 1 file changed, 36 insertions(+), 84 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index a1bb5ff630..637f7d06c2 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2079,106 +2079,58 @@ bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers) { - unsigned int num_regs = registers.Count(); + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); - if (num_regs % 2) - { - bool first = true; + if (!num_regs) + return; - // Stack is required to be quad-word aligned. - u32 stack_size = Common::AlignUp(num_regs * 8, 16); - u32 current_offset = 0; - std::vector<ARM64Reg> reg_pair; + // 8 bytes per register, but 16-byte alignment, so we may have to pad by one register. + // Only update the SP once to avoid the dependency between those stores. - for (auto it : registers) - { - if (first) - { - STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -(s32)stack_size); - first = false; - current_offset += 16; - } - else - { - reg_pair.push_back((ARM64Reg)(X0 + it)); - if (reg_pair.size() == 2) - { - STP(INDEX_SIGNED, reg_pair[0], reg_pair[1], SP, current_offset); - reg_pair.clear(); - current_offset += 16; - } - } - } - } + // The first push must adjust the SP, else a context switch may invalidate everything below SP. 
+ if (num_regs & 1) + STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size); else - { - std::vector<ARM64Reg> reg_pair; + STP(INDEX_PRE, (ARM64Reg)(X0 + *it++), (ARM64Reg)(X0 + *it++), SP, -stack_size); - for (auto it : registers) - { - reg_pair.push_back((ARM64Reg)(X0 + it)); - if (reg_pair.size() == 2) - { - STP(INDEX_PRE, reg_pair[0], reg_pair[1], SP, -16); - reg_pair.clear(); - } - } - } + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + STP(INDEX_SIGNED, (ARM64Reg)(X0 + *it++), (ARM64Reg)(X0 + *it++), SP, 16 * (i + 1)); + + _assert_msg_(DYNA_REC, it == registers.end(), "%s registers don't match.", __FUNCTION__); } void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask) { int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); - if (num_regs % 2) - { - bool first = true; + if (!num_regs) + return; - std::vector<ARM64Reg> reg_pair; + // We must adjust the SP at the end, so load the first (two) registers last. + ARM64Reg first = (ARM64Reg)(X0 + *it++); + ARM64Reg second; + if (!(num_regs & 1)) + second = (ARM64Reg)(X0 + *it++); - for (auto it : registers) - { - if (ignore_mask[it]) - it = WSP; + // 8 bytes per register, but 16-byte alignment, so we may have to pad by one register. + // Only update the SP on the last load to avoid the dependency between those loads. - if (first) - { - LDR(INDEX_POST, (ARM64Reg)(X0 + it), SP, 16); - first = false; - } - else - { - reg_pair.push_back((ARM64Reg)(X0 + it)); - if (reg_pair.size() == 2) - { - LDP(INDEX_POST, reg_pair[0], reg_pair[1], SP, 16); - reg_pair.clear(); - } - } - } - } + // Fast load for all but the first (two) registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + LDP(INDEX_SIGNED, (ARM64Reg)(X0 + *it++), (ARM64Reg)(X0 + *it++), SP, 16 * (i + 1)); + + // Post loading the first (two) registers. 
+ if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); else - { - std::vector<ARM64Reg> reg_pair; + LDP(INDEX_POST, first, second, SP, stack_size); - for (int i = 31; i >= 0; --i) - { - if (!registers[i]) - continue; - - int reg = i; - - if (ignore_mask[reg]) - reg = WSP; - - reg_pair.push_back((ARM64Reg)(X0 + reg)); - if (reg_pair.size() == 2) - { - LDP(INDEX_POST, reg_pair[1], reg_pair[0], SP, 16); - reg_pair.clear(); - } - } - } + _assert_msg_(DYNA_REC, it == registers.end(), "%s registers don't match.", __FUNCTION__); } // Float Emitter From 8829af62cb57a5cc2f5d3185065f9b6bb5c7ee0e Mon Sep 17 00:00:00 2001 From: degasus Date: Fri, 10 Feb 2017 00:21:26 +0100 Subject: [PATCH 2/2] JitArm64: Fix for stack push/pop ABI. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 3 +++ Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 50cd4f908f..1931369319 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -753,6 +753,7 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* gpr.Lock(W30); BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); regs_in_use[W30] = 0; FixupBranch Exception = B(); @@ -761,8 +762,10 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* FixupBranch exit = B(); SetJumpTarget(Exception); ABI_PushRegisters(regs_in_use); + m_float_emit.ABI_PushRegisters(fprs_in_use, X30); MOVP2R(X30, &GPFifo::FastCheckGatherPipe); BLR(X30); + m_float_emit.ABI_PopRegisters(fprs_in_use, X30); ABI_PopRegisters(regs_in_use); // Inline exception check diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index bd35f2ace4..700a43ae99 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ 
b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -19,12 +19,15 @@ using namespace Arm64Gen; void JitArm64::GenerateAsm() { // This value is all of the callee saved registers that we are required to save. - // According to the AACPS64 we need to save R19 ~ R30. + // According to the AAPCS64 we need to save R19 ~ R30 and Q8 ~ Q15. const u32 ALL_CALLEE_SAVED = 0x7FF80000; + const u32 ALL_CALLEE_SAVED_FPR = 0x0000FF00; BitSet32 regs_to_save(ALL_CALLEE_SAVED); + BitSet32 regs_to_save_fpr(ALL_CALLEE_SAVED_FPR); enterCode = GetCodePtr(); ABI_PushRegisters(regs_to_save); + m_float_emit.ABI_PushRegisters(regs_to_save_fpr, X30); MOVP2R(PPC_REG, &PowerPC::ppcState); @@ -175,6 +178,7 @@ void JitArm64::GenerateAsm() LDR(INDEX_UNSIGNED, X0, X1, 0); ADD(SP, X0, 0); + m_float_emit.ABI_PopRegisters(regs_to_save_fpr, X30); ABI_PopRegisters(regs_to_save); RET(X30);