From ca4fa1ac74e6f83ea91044bfb5a2352b3fd84b5c Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 18 Aug 2024 06:42:20 +0300
Subject: [PATCH] Use return-oriented trap approach

---
 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp | 298 ++++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h   | 102 ++++
 .../Emu/CPU/Backends/AArch64/AArch64Common.h  |  39 ++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp | 529 ++++++++++++++++++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h   |  86 +++
 5 files changed, 1054 insertions(+)
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h

diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
new file mode 100644
index 0000000000..4c8a01e0e2
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
@@ -0,0 +1,298 @@
+#include "stdafx.h"
+#include "AArch64ASM.h"
+
+#include "Utilities/StrFmt.h"
+
+namespace aarch64
+{
+	using fmt_replacement_list_t = std::vector<std::pair<std::string_view, std::string>>;
+
+	void UASM::embed_args(compiled_instruction_t& instruction, const std::vector<Arg>& args, const std::vector<gpr>& clobbered)
+	{
+		for (const auto& arg : args)
+		{
+			switch (arg.type)
+			{
+			case ArgType::Immediate:
+			case ArgType::Register:
+			case ArgType::SRegister:
+				// Embedded directly
+				break;
+			case ArgType::LLVMInt:
+				instruction.constraints.push_back("i");
+				instruction.args.push_back(arg.value);
+				break;
+			case ArgType::LLVMReg:
+				instruction.constraints.push_back("r");
+				instruction.args.push_back(arg.value);
+				break;
+			case ArgType::LLVMPtr:
+				instruction.constraints.push_back("m");
+				instruction.args.push_back(arg.value);
+				break;
+			default:
+				break;
+			}
+		}
+
+		for (const auto& reg : clobbered)
+		{
+			const auto clobber = fmt::format("~{%s}", gpr_names[static_cast<int>(reg)]);
+			instruction.constraints.push_back(clobber);
+		}
+	}
+
+	void UASM::emit0(const char* inst)
+	{
+		compiled_instruction_t i{};
+		i.asm_ = inst;
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit1(const char* inst, const Arg& arg0, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) }
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit2(const char* inst, const Arg& arg0, const Arg& arg1, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit3(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+			{ "{2}", arg2.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1, arg2 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit4(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const Arg& arg3, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+			{ "{2}", arg2.to_string(&arg_id) },
+			{ "{3}", arg3.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1, arg2, arg3 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::insert(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx) const
+	{
+		for (const auto& inst : m_instructions)
+		{
+			auto constraints = fmt::merge(inst.constraints, ",");
+			llvm_asm(irb, inst.asm_, inst.args, constraints, ctx);
+		}
+	}
+
+	void UASM::append(const UASM& that)
+	{
+		m_instructions.reserve(m_instructions.size() + that.m_instructions.size());
+		std::copy(that.m_instructions.begin(), that.m_instructions.end(), std::back_inserter(this->m_instructions));
+	}
+
+	void UASM::prepend(const UASM& that)
+	{
+		auto new_instructions = that.m_instructions;
+		new_instructions.reserve(m_instructions.size() + that.m_instructions.size());
+		std::copy(m_instructions.begin(), m_instructions.end(), std::back_inserter(new_instructions));
+		m_instructions = std::move(new_instructions);
+	}
+
+	// Convenience arg wrappers
+	UASM::Arg UASM::Int(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMInt,
+			.value = v
+		};
+	}
+
+	UASM::Arg UASM::Imm(s64 v)
+	{
+		return Arg {
+			.type = ArgType::Immediate,
+			.imm = v
+		};
+	}
+
+	UASM::Arg UASM::Reg(gpr reg)
+	{
+		return Arg {
+			.type = ArgType::Register,
+			.reg = reg
+		};
+	}
+
+	UASM::Arg UASM::Reg(spr reg)
+	{
+		return Arg {
+			.type = ArgType::SRegister,
+			.sreg = reg
+		};
+	}
+
+	UASM::Arg UASM::Ptr(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMPtr,
+			.value = v
+		};
+	}
+
+	UASM::Arg UASM::Var(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMReg,
+			.value = v
+		};
+	}
+
+	void UASM::mov(gpr dst, gpr src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), Reg(src), { dst });
+	}
+
+	void UASM::mov(gpr dst, const Arg& src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), src, { dst });
+	}
+
+	void UASM::movnt(gpr dst, const Arg& src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), src, {});
+	}
+
+	void UASM::str(gpr src, gpr base, const Arg& offset)
+	{
+		emit3("str {0}, [{1}, {2}]", Reg(src), Reg(base), offset, {});
+	}
+
+	void UASM::str(const Arg& src, spr base, const Arg& offset)
+	{
+		emit3("str {0}, [{1}, {2}]", src, Reg(base), offset, {});
+	}
+
+	void UASM::ldr(gpr dst, gpr base, const Arg& offset)
+	{
+		emit3("ldr {0}, [{1}, {2}]", Reg(dst), Reg(base), offset, { dst });
+	}
+
+	void UASM::ldr(gpr dst, spr base, const Arg& offset)
+	{
+		emit3("ldr {0}, [{1}, {2}]", Reg(dst), Reg(base), offset, { dst });
+	}
+
+	void UASM::stp(gpr src0, gpr src1, gpr base, const Arg& offset)
+	{
+		emit4("stp {0}, {1}, [{2}, {3}]", Reg(src0), Reg(src1), Reg(base), offset, {});
+	}
+
+	void UASM::ldp(gpr dst0, gpr dst1, gpr base, const Arg& offset)
+	{
+		emit4("ldp {0}, {1}, [{2}, {3}]", Reg(dst0), Reg(dst1), Reg(base), offset, { dst0, dst1 });
+	}
+
+	void UASM::b(const Arg& target)
+	{
+		emit1("b {0}", target, {});
+	}
+
+	void UASM::br(gpr target)
+	{
+		emit1("br {0}", Reg(target), {});
+	}
+
+	void UASM::br(const Arg& target)
+	{
+		emit1("br {0}", target, {});
+	}
+
+	void UASM::ret()
+	{
+		emit0("ret");
+	}
+
+	void UASM::adr(gpr dst, const Arg& src)
+	{
+		emit2("adr {0}, {1}", Reg(dst), src, { dst });
+	}
+
+	void UASM::add(spr dst, spr src0, const Arg& src1)
+	{
+		emit3("add {0}, {1}, {2}", Reg(dst), Reg(src0), src1, {});
+	}
+
+	void UASM::sub(spr dst, spr src0, const Arg& src1)
+	{
+		emit3("sub {0}, {1}, {2}", Reg(dst), Reg(src0), src1, {});
+	}
+
+	void UASM::nop(const std::vector<Arg>& refs)
+	{
+		emit0("nop");
+		embed_args(m_instructions.back(), refs, {});
+	}
+
+	void UASM::brk(int mark)
+	{
+		emit1("brk {0}", Imm(mark), {});
+	}
+
+	std::string UASM::Arg::to_string(int* id) const
+	{
+		// Safety check around the optional arg incrementer
+		int dummy = 0;
+		if (!id)
+		{
+			id = &dummy;
+		}
+
+		switch (type)
+		{
+		case ArgType::Immediate:
+			return std::string("#") + std::to_string(imm);
+		case ArgType::Register:
+			return gpr_names[static_cast<int>(reg)];
+		case ArgType::SRegister:
+			return spr_asm_names[static_cast<int>(sreg)];
+		case ArgType::LLVMInt:
+		case ArgType::LLVMReg:
+		case ArgType::LLVMPtr:
+		default:
+			// Return placeholder identifier
+			return std::string("$") + std::to_string((*id)++);
+		}
+	}
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
new file mode 100644
index 0000000000..f494fafc63
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "AArch64Common.h"
+
+namespace aarch64
+{
+	// Micro-assembler
+	class UASM
+	{
+	public:
+		enum ArgType
+		{
+			Register = 0,
+			SRegister,
+			Immediate,
+			LLVMInt,
+			LLVMPtr,
+			LLVMReg
+		};
+
+		struct Arg
+		{
+			ArgType type;
+			union
+			{
+				llvm::Value* value;
+				gpr reg;
+				spr sreg;
+				s64 imm;
+			};
+
+			std::string to_string(int* id = nullptr) const;
+		};
+
+	protected:
+		struct compiled_instruction_t
+		{
+			std::string asm_;
+			std::vector<std::string> constraints;
+			std::vector<llvm::Value*> args;
+		};
+
+		std::vector<compiled_instruction_t> m_instructions;
+
+		void emit0(const char* inst);
+		void emit1(const char* inst, const Arg& arg0, const std::vector<gpr>& clobbered);
+		void emit2(const char* inst, const Arg& arg0, const Arg& arg1, const std::vector<gpr>& clobbered);
+		void emit3(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const std::vector<gpr>& clobbered);
+		void emit4(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const Arg& arg3, const std::vector<gpr>& clobbered);
+
+		void embed_args(compiled_instruction_t& instruction, const std::vector<Arg>& args, const std::vector<gpr>& clobbered);
+
+	public:
+		UASM() = default;
+
+		// Convenience wrappers
+		static Arg Int(llvm::Value* v);
+		static Arg Imm(s64 v);
+		static Arg Reg(gpr reg);
+		static Arg Reg(spr reg);
+		static Arg Ptr(llvm::Value* v);
+		static Arg Var(llvm::Value* v);
+
+		void mov(gpr dst, gpr src);
+		void mov(gpr dst, const Arg& src);
+		void movnt(gpr dst, const Arg& src);
+
+		void adr(gpr dst, const Arg& src);
+
+		void str(gpr src, gpr base, const Arg& offset);
+		void str(gpr src, spr base, const Arg& offset);
+		void str(const Arg& src, gpr base, const Arg& offset);
+		void str(const Arg& src, spr base, const Arg& offset);
+		void ldr(gpr dst, gpr base, const Arg& offset);
+		void ldr(gpr dst, spr base, const Arg& offset);
+
+		void stp(gpr src0, gpr src1, gpr base, const Arg& offset);
+		void stp(gpr src0, gpr src1, spr base, const Arg& offset);
+		void ldp(gpr dst0, gpr dst1, gpr base, const Arg& offset);
+		void ldp(gpr dst0, gpr dst1, spr base, const Arg& offset);
+
+		void add(spr dst, spr src0, const Arg& src1);
+		void add(gpr dst, gpr src0, const Arg& src1);
+		void sub(spr dst, spr src0, const Arg& src1);
+		void sub(gpr dst, gpr src0, const Arg& src1);
+
+		void b(const Arg& target);
+		void br(gpr target);
+		void br(const Arg& target);
+		void ret();
+
+		void nop(const std::vector<Arg>& refs = {});
+		void brk(int mark = 0);
+
+		void append(const UASM& other);
+		void prepend(const UASM& other);
+
+		void insert(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx) const;
+	};
+
+	using ASMBlock = UASM;
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
new file mode 100644
index 0000000000..9b77e5eb36
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include "../../CPUTranslator.h"
+
+namespace aarch64
+{
+	enum gpr : s32
+	{
+		x0 = 0,
+		x1, x2, x3, x4, x5, x6, x7, x8, x9,
+		x10, x11, x12, x13, x14, x15, x16, x17, x18, x19,
+		x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30
+	};
+
+	enum spr : s32
+	{
+		xzr = 0,
+		pc,
+		sp
+	};
+
+	static const char* gpr_names[] =
+	{
+		"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
+		"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+		"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
+	};
+
+	static const char* spr_names[] =
+	{
+		"xzr", "pc", "sp"
+	};
+
+	static const char* spr_asm_names[] =
+	{
+		"xzr", ".", "sp"
+	};
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
new file mode 100644
index 0000000000..6235aa4ea4
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
@@ -0,0 +1,529 @@
+#include "stdafx.h"
+
+
+#include "AArch64JIT.h"
+#include "AArch64ASM.h"
+
+LOG_CHANNEL(jit_log, "JIT");
+
+#define STDOUT_DEBUG 0
+
+#define DPRINT1(...)\
+	do {\
+		printf(__VA_ARGS__);\
+		printf("\n");\
+		fflush(stdout);\
+	} while (0)
+
+#if STDOUT_DEBUG
+#define DPRINT DPRINT1
+#else
+#define DPRINT jit_log.trace
+#endif
+
+namespace aarch64
+{
+	using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
+	using function_info_t = GHC_frame_preservation_pass::function_info_t;
+
+	GHC_frame_preservation_pass::GHC_frame_preservation_pass(const config_t& configuration)
+		: m_config(configuration)
+	{}
+
+	void GHC_frame_preservation_pass::reset()
+	{
+		m_visited_functions.clear();
+	}
+
+	void GHC_frame_preservation_pass::force_tail_call_terminators(llvm::Function& f)
+	{
+		// GHC functions are not call-stack preserving and can therefore never return if they make any external calls at all.
+		// Replace every terminator clause with a tail call explicitly. This is already done for X64, but better safe than sorry.
+		for (auto& bb : f)
+		{
+			auto bit = bb.begin(), prev = bb.end();
+			for (; bit != bb.end(); prev = bit, ++bit)
+			{
+				if (prev == bb.end())
+				{
+					continue;
+				}
+
+				if (llvm::isa<llvm::ReturnInst>(&*bit))
+				{
+					if (auto ci = llvm::dyn_cast<llvm::CallInst>(&*prev))
+					{
+						// This is a "ret" that is coming after a "call" to another function.
+						// Enforce that it must be a tail call.
+						if (!ci->isTailCall())
+						{
+							ci->setTailCall();
+						}
+					}
+				}
+			}
+		}
+	}
+
+	function_info_t GHC_frame_preservation_pass::preprocess_function(const llvm::Function& f)
+	{
+		function_info_t result{};
+		result.instruction_count = f.getInstructionCount();
+
+		// Blanket exclusions. Stubs or dispatchers that do not compute anything themselves.
+		if (f.getName() == "__spu-null")
+		{
+			// Don't waste the effort processing this stub. It has no points of concern
+			result.num_external_calls = 1;
+			return result;
+		}
+
+		if (m_config.use_stack_frames)
+		{
+			// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
+			u32 stack_frame_size = 128u;
+			// Actual ratio is usually around 1:4
+			const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+			// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+			// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+			// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
+			// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
+			const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+			stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+
+			result.stack_frame_size = stack_frame_size;
+		}
+
+		result.instruction_count = f.getInstructionCount();
+		result.num_external_calls = 0;
+
+		// The LR is not spared by LLVM in cases where there is a lot of spilling.
+		// This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR.
+		// This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit.
+		// For now, assume it is dirtied if the function is of any reasonable length.
+		result.clobbers_x30 = result.instruction_count > 32;
+		result.is_leaf = true;
+
+		for (auto& bb : f)
+		{
+			for (auto& inst : bb)
+			{
+				if (auto ci = llvm::dyn_cast<llvm::CallInst>(&inst))
+				{
+					if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
+					{
+						// Inline ASM blocks are ignored
+						continue;
+					}
+
+					result.num_external_calls++;
+					if (ci->isTailCall())
+					{
+						// This is not a leaf if it has at least one exit point / terminator that is not a return instruction.
+						result.is_leaf = false;
+					}
+					else
+					{
+						// Returning calls always clobber x30
+						result.clobbers_x30 = true;
+					}
+				}
+			}
+		}
+
+		return result;
+	}
+
+	instruction_info_t GHC_frame_preservation_pass::decode_instruction(const llvm::Function& f, const llvm::Instruction* i)
+	{
+		instruction_info_t result{};
+		if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+		{
+			// Watch out for injected ASM blocks...
+			if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
+			{
+				// Not a real call. This is just an insert of inline asm
+				return result;
+			}
+
+			result.is_call_inst = true;
+			result.is_returning = true;
+			result.preserve_stack = !ci->isTailCall();
+			result.callee = ci->getCalledFunction();
+			result.is_tail_call = ci->isTailCall();
+
+			if (!result.callee)
+			{
+				// Indirect call (call from raw value).
+				result.is_indirect = true;
+				result.callee_is_GHC = ci->getCallingConv() == llvm::CallingConv::GHC;
+				result.callee_name = "__indirect_call";
+			}
+			else
+			{
+				result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
+				result.callee_name = result.callee->getName().str();
+			}
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::BranchInst>(i))
+		{
+			// More likely to jump out via an unconditional...
+			if (!bi->isConditional())
+			{
+				ensure(bi->getNumSuccessors() == 1);
+				auto targetbb = bi->getSuccessor(0);
+
+				result.callee = targetbb->getParent();
+				result.callee_name = result.callee->getName().str();
+				result.is_call_inst = result.callee_name != f.getName();
+			}
+
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::IndirectBrInst>(i))
+		{
+			// Very unlikely to be the same function. Can be considered a function exit.
+			ensure(bi->getNumDestinations() == 1);
+			auto targetbb = ensure(bi->getSuccessor(0)); // This is guaranteed to fail but I've yet to encounter this
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::CallBrInst>(i))
+		{
+			ensure(bi->getNumSuccessors() == 1);
+			auto targetbb = bi->getSuccessor(0);
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::InvokeInst>(i))
+		{
+			ensure(bi->getNumSuccessors() == 2);
+			auto targetbb = bi->getSuccessor(0);
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		return result;
+	}
+
+	gpr GHC_frame_preservation_pass::get_base_register_for_call(const std::string& callee_name, gpr default_reg)
+	{
+		// We go over the base_register_lookup table and find the first matching pattern
+		for (const auto& pattern : m_config.base_register_lookup)
+		{
+			if (callee_name.starts_with(pattern.first))
+			{
+				return pattern.second;
+			}
+		}
+
+		return default_reg;
+	}
+
+	void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
+	{
+		if (f.getCallingConv() != llvm::CallingConv::GHC)
+		{
+			// If we're not doing GHC, the calling conv will have stack fixup on its own via prologue/epilogue
+			return;
+		}
+
+		if (f.getInstructionCount() == 0)
+		{
+			// Nothing to do. Happens with placeholder functions such as branch patchpoints
+			return;
+		}
+
+		const auto this_name = f.getName().str();
+		if (m_visited_functions.find(this_name) != m_visited_functions.end())
+		{
+			// Already processed. Only useful when recursing, which is currently not used.
+			DPRINT("Function %s was already processed. Skipping.\n", this_name.c_str());
+			return;
+		}
+
+		if (this_name != "__spu-null") // This name is meaningless and doesn't uniquely identify a function
+		{
+			m_visited_functions.insert(this_name);
+		}
+
+		if (m_config.exclusion_callback && m_config.exclusion_callback(this_name))
+		{
+			// Function is explicitly excluded
+			return;
+		}
+
+		// Preprocessing.
+		auto function_info = preprocess_function(f);
+		if (function_info.num_external_calls == 0 && function_info.stack_frame_size == 0)
+		{
+			// No stack frame injection and no external calls to patch up. This is a leaf function, nothing to do.
+			DPRINT("Ignoring function %s", this_name.c_str());
+			return;
+		}
+
+		// Force tail calls on all terminators
+		force_tail_call_terminators(f);
+
+		// Check for leaves
+		if (function_info.is_leaf && !m_config.use_stack_frames)
+		{
+			// Sanity check. If this function had no returning calls, it should have been omitted from processing.
+			ensure(function_info.clobbers_x30, "Function has no terminator and no non-tail calls but was allowed for frame processing!");
+			DPRINT("Function %s is a leaf.", this_name.c_str());
+			process_leaf_function(irb, f);
+			return;
+		}
+
+		// Asm snippets for patching stack frame
+		ASMBlock frame_prologue, frame_epilogue;
+
+		if (function_info.stack_frame_size > 0)
+		{
+			// NOTE: The stack frame here is purely optional, we can pre-allocate scratch on the gateway.
+			// However, that is an optimization for another time, this helps make debugging easier.
+			frame_prologue.sub(sp, sp, UASM::Imm(function_info.stack_frame_size));
+			frame_epilogue.add(sp, sp, UASM::Imm(function_info.stack_frame_size));
+
+			// Emit the frame prologue. We use a BB here for extra safety as it solves the problem of backwards jumps re-executing the prologue.
+			auto functionStart = &f.front();
+			auto prologueBB = llvm::BasicBlock::Create(f.getContext(), "", &f, functionStart);
+			irb->SetInsertPoint(prologueBB, prologueBB->begin());
+			frame_prologue.insert(irb, f.getContext());
+			irb->CreateBr(functionStart);
+		}
+
+		// Now we start processing
+		bool terminator_found = false;
+		for (auto& bb : f)
+		{
+			for (auto bit = bb.begin(); bit != bb.end();)
+			{
+				const auto instruction_info = decode_instruction(f, &(*bit));
+				if (!instruction_info.is_call_inst)
+				{
+					++bit;
+					continue;
+				}
+
+				std::string callee_name = "__unknown";
+				if (const auto cf = instruction_info.callee)
+				{
+					callee_name = cf->getName().str();
+					if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
+					{
+						// Always inlined call. Likely inline Asm. Skip
+						++bit;
+						continue;
+					}
+
+					// Technically, we should also ignore any host functions linked in, usually starting with a ppu_ or spu_ prefix.
+					// However, there is little guarantee that those are safe, with only rare exceptions, and patching the frame around them does not hurt much anyway.
+				}
+
+				if (instruction_info.preserve_stack)
+				{
+					// Non-tail call. If we have a stack allocated, we preserve it across the call
+					++bit;
+					continue;
+				}
+
+				ensure(instruction_info.is_tail_call);
+				terminator_found = true;
+
+				// Now we patch the call if required. For normal calls that 'return' (i.e. calls to the C/C++ ABI), we do not patch them as they will manage the stack themselves (callee-managed).
+				bit = patch_tail_call(irb, f, bit, instruction_info, function_info, frame_epilogue);
+
+				// Next
+				if (bit != bb.end())
+				{
+					++bit;
+				}
+			}
+		}
+
+		if (!terminator_found)
+		{
+			// If we got here, we must be using stack frames.
+			ensure(function_info.is_leaf && function_info.stack_frame_size > 0, "Leaf function was processed without using stack frames!");
+
+			// We want to insert a frame cleanup at the tail at every return instruction we find.
+			for (auto& bb : f)
+			{
+				for (auto& i : bb)
+				{
+					if (is_ret_instruction(&i))
+					{
+						irb->SetInsertPoint(&i);
+						frame_epilogue.insert(irb, f.getContext());
+					}
+				}
+			}
+		}
+	}
+
+	llvm::BasicBlock::iterator
+	GHC_frame_preservation_pass::patch_tail_call(
+		llvm::IRBuilder<>* irb,
+		llvm::Function& f,
+		llvm::BasicBlock::iterator where,
+		const instruction_info_t& instruction_info,
+		const function_info_t& function_info,
+		const UASM& frame_epilogue)
+	{
+		auto ci = llvm::dyn_cast<llvm::CallInst>(&*where);
+		irb->SetInsertPoint(ensure(ci));
+
+		const auto this_name = f.getName().str();
+
+		// Insert breadcrumb info before the call
+		if (m_config.debug_info)
+		{
+			// Call-chain tracing
+			ASMBlock c;
+			c.mov(x29, x28);
+			c.mov(x28, x27);
+			c.adr(x27, UASM::Reg(pc));
+			c.insert(irb, f.getContext());
+		}
+
+		// Clean up any injected frames before the call
+		if (function_info.stack_frame_size > 0)
+		{
+			frame_epilogue.insert(irb, f.getContext());
+		}
+
+		// Insert the next piece after the call, before the ret
+		++where;
+		ensure(llvm::isa<llvm::ReturnInst>(&*where));
+		irb->SetInsertPoint(llvm::dyn_cast<llvm::Instruction>(&*where));
+
+		if (instruction_info.callee_is_GHC &&        // Calls to the C++ ABI will always return
+			!instruction_info.is_indirect &&         // We don't know enough when calling indirectly to know if we'll return or not
+			instruction_info.callee_name.find("-pp-") == umax) // Skip branch patch-points as those are just indirect calls. TODO: Move this to instruction decode.
+		{
+			// We're making a one-way call. This branch shouldn't even bother linking as it will never return here.
+			ASMBlock c;
+			c.brk(0x99);
+			c.insert(irb, f.getContext());
+			return where;
+		}
+
+		// Patch the return path. No GHC call shall ever return to another. If we reach the function endpoint, immediately abort back to the gateway.
+		auto thread_base_reg = get_base_register_for_call(f.getName().str());
+		auto arg_index = static_cast<int>(thread_base_reg) - static_cast<int>(x19);
+		ASMBlock c;
+
+		auto thread_arg = ensure(f.getArg(arg_index)); // Guaranteed to hold our original 'thread'
+		c.mov(x30, UASM::Var(thread_arg));
+		c.ldr(x30, x30, UASM::Imm(m_config.hypervisor_context_offset));
+		c.insert(irb, f.getContext());
+
+		// Next
+		return where;
+	}
+
+	bool GHC_frame_preservation_pass::is_ret_instruction(const llvm::Instruction* i)
+	{
+		if (llvm::isa<llvm::ReturnInst>(i))
+		{
+			return true;
+		}
+
+		// Check for inline asm invoking "ret". This really shouldn't be a thing, but it is present in SPULLVMRecompiler for some reason.
+		if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+		{
+			if (auto asm_ = llvm::dyn_cast<llvm::InlineAsm>(ci->getCalledOperand()))
+			{
+				if (asm_->getAsmString() == "ret")
+				{
+					return true;
+				}
+			}
+		}
+
+		return false;
+	}
+
+	bool GHC_frame_preservation_pass::is_inlined_call(const llvm::CallInst* ci)
+	{
+		const auto callee = ci->getCalledFunction();
+		if (!callee)
+		{
+			// Indirect BLR
+			return false;
+		}
+
+		const std::string callee_name = callee->getName().str();
+		if (callee_name.starts_with("llvm."))
+		{
+			// Intrinsic
+			return true;
+		}
+
+		if (callee->hasFnAttribute(llvm::Attribute::AlwaysInline))
+		{
+			// Assume LLVM always obeys this
+			return true;
+		}
+
+		return false;
+	}
+
+	void GHC_frame_preservation_pass::process_leaf_function(llvm::IRBuilder<>* irb, llvm::Function& f)
+	{
+		for (auto& bb : f)
+		{
+			for (auto bit = bb.begin(); bit != bb.end();)
+			{
+				auto i = llvm::dyn_cast<llvm::Instruction>(&*bit);
+				if (!is_ret_instruction(i))
+				{
+					++bit;
+					continue;
+				}
+
+				// Insert sequence before the return
+				irb->SetInsertPoint(llvm::dyn_cast<llvm::Instruction>(&*bit));
+
+				if (m_config.debug_info)
+				{
+					// We need to save the chain return point.
+					ASMBlock c;
+					c.mov(x29, x28);
+					c.mov(x28, x27);
+					c.adr(x27, UASM::Reg(pc));
+					c.insert(irb, f.getContext());
+				}
+
+				// Now we need to reload LR. We abuse the function's caller arg set for this to avoid messing with regs too much
+				auto thread_base_reg = get_base_register_for_call(f.getName().str());
+				auto arg_index = static_cast<int>(thread_base_reg) - static_cast<int>(x19);
+				ASMBlock c;
+
+				auto thread_arg = ensure(f.getArg(arg_index)); // Guaranteed to hold our original 'thread'
+				c.mov(x30, UASM::Var(thread_arg));
+				c.ldr(x30, x30, UASM::Imm(m_config.hypervisor_context_offset));
+				c.insert(irb, f.getContext());
+
+				if (bit != bb.end())
+				{
+					++bit;
+				}
+			}
+		}
+	}
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h
new file mode 100644
index 0000000000..ed2a2b08e5
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#ifndef ARCH_ARM64
+#error "You have included an arm-only header"
+#endif
+
+#include "AArch64Common.h"
+
+#include <unordered_set>
+
+namespace aarch64
+{
+	class UASM;
+	using ASMBlock = UASM;
+
+	// On non-x86 architectures GHC runs stackless. SP is treated as a pointer to scratchpad memory.
+	// This pass keeps this behavior intact while preserving the expectations of the host's C++ ABI.
+	class GHC_frame_preservation_pass : public translator_pass
+	{
+	public:
+		struct function_info_t
+		{
+			u32 instruction_count;
+			u32 num_external_calls;
+			u32 stack_frame_size; // Guessing this properly is critical for vector-heavy functions where spilling is a lot more common
+			bool clobbers_x30;
+			bool is_leaf;
+		};
+
+		struct instruction_info_t
+		{
+			bool is_call_inst;       // Is a function call. This includes a branch to external code.
+			bool preserve_stack;     // Preserve the stack around this call.
+			bool is_returning;       // This instruction "returns" to the next instruction (typically just llvm::CallInst*)
+			bool callee_is_GHC;      // The other function is GHC
+			bool is_tail_call;       // Tail call. Assume it is an exit/terminator.
+			bool is_indirect;        // Indirect call. Target is the first operand.
+			llvm::Function* callee;  // Callee if any
+			std::string callee_name; // Name of the callee.
+		};
+
+		struct config_t
+		{
+			bool debug_info = false;           // Record debug information
+			bool use_stack_frames = true;      // Allocate a stack frame for each function. The gateway can alternatively manage a global stack to use as scratch.
+			bool optimize = true;              // Optimize instructions when possible. Set to false when debugging.
+			u32 hypervisor_context_offset = 0; // Offset within the "thread" object where we can find the hypervisor context (registers configured at gateway).
+			std::function<bool(const std::string&)> exclusion_callback;    // [Optional] Callback run on each function before transform. Return "true" to exclude from frame processing.
+			std::vector<std::pair<std::string, gpr>> base_register_lookup; // [Optional] Function lookup table to determine the location of the "thread" context.
+		};
+
+	protected:
+		std::unordered_set<std::string> m_visited_functions;
+
+		config_t m_config;
+
+		void force_tail_call_terminators(llvm::Function& f);
+
+		function_info_t preprocess_function(const llvm::Function& f);
+
+		instruction_info_t decode_instruction(const llvm::Function& f, const llvm::Instruction* i);
+
+		bool is_ret_instruction(const llvm::Instruction* i);
+
+		bool is_inlined_call(const llvm::CallInst* ci);
+
+		gpr get_base_register_for_call(const std::string& callee_name, gpr default_reg = gpr::x19);
+
+		void process_leaf_function(llvm::IRBuilder<>* irb, llvm::Function& f);
+
+		llvm::BasicBlock::iterator patch_tail_call(
+			llvm::IRBuilder<>* irb,
+			llvm::Function& f,
+			llvm::BasicBlock::iterator where,
+			const instruction_info_t& instruction_info,
+			const function_info_t& function_info,
+			const ASMBlock& frame_epilogue);
+	public:
+
+		GHC_frame_preservation_pass(const config_t& configuration);
+		~GHC_frame_preservation_pass() = default;
+
+		void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
+		void reset() override;
+	};
+}
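
Usage sketch (not part of this patch): a minimal, hypothetical driver showing how a backend might configure and run GHC_frame_preservation_pass over a module after IR generation. It assumes an ARCH_ARM64 build and rpcs3-style include paths; the function name run_ghc_frame_pass, the "spu" base-register pattern, and the zero hypervisor_context_offset are illustrative placeholders, since the real call site, offsets, and exclusion rules live in the PPU/SPU emitters.

// Hypothetical driver for the pass; names and values below are assumptions, not taken from this patch.
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Module.h>

#include "Emu/CPU/Backends/AArch64/AArch64JIT.h"

static void run_ghc_frame_pass(llvm::Module& module)
{
	aarch64::GHC_frame_preservation_pass::config_t config{};
	config.debug_info = false;            // Enable to emit the x27/x28/x29 call-chain breadcrumbs
	config.use_stack_frames = true;       // Give each GHC function its own scratch frame
	config.hypervisor_context_offset = 0; // Placeholder: offset of the gateway (hypervisor) context inside the thread object
	config.base_register_lookup = { { "spu", aarch64::x19 } }; // Placeholder: assume SPU functions carry the thread pointer in x19

	aarch64::GHC_frame_preservation_pass pass(config);
	llvm::IRBuilder<> irb(module.getContext());

	// The pass itself skips non-GHC functions, empty placeholders, and anything the exclusion callback rejects.
	for (auto& f : module)
	{
		if (!f.isDeclaration())
		{
			pass.run(&irb, f);
		}
	}

	pass.reset(); // Clear the visited-function cache before processing the next module
}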