From ca4fa1ac74e6f83ea91044bfb5a2352b3fd84b5c Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 18 Aug 2024 06:42:20 +0300
Subject: [PATCH] Use return-oriented trap approach

---
 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp | 298 ++++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h   | 102 ++++
 .../Emu/CPU/Backends/AArch64/AArch64Common.h  |  39 ++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp | 529 ++++++++++++++++++
 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h   |  86 +++
 5 files changed, 1054 insertions(+)
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
 create mode 100644 rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h

diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
new file mode 100644
index 0000000000..4c8a01e0e2
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.cpp
@@ -0,0 +1,298 @@
+#include "stdafx.h"
+#include "AArch64ASM.h"
+
+#include "Utilities/StrFmt.h"
+
+namespace aarch64
+{
+	using fmt_replacement_list_t = std::vector<std::pair<std::string_view, std::string>>;
+
+	void UASM::embed_args(compiled_instruction_t& instruction, const std::vector<Arg>& args, const std::vector<gpr>& clobbered)
+	{
+		for (const auto& arg : args)
+		{
+			switch (arg.type)
+			{
+			case ArgType::Immediate:
+			case ArgType::Register:
+			case ArgType::SRegister:
+				// Embedded directly
+				break;
+			case ArgType::LLVMInt:
+				instruction.constraints.push_back("i");
+				instruction.args.push_back(arg.value);
+				break;
+			case ArgType::LLVMReg:
+				instruction.constraints.push_back("r");
+				instruction.args.push_back(arg.value);
+				break;
+			case ArgType::LLVMPtr:
+				instruction.constraints.push_back("m");
+				instruction.args.push_back(arg.value);
+				break;
+			default:
+				break;
+			}
+		}
+
+		for (const auto& reg : clobbered)
+		{
+			const auto clobber = fmt::format("~{%s}", gpr_names[static_cast<int>(reg)]);
+			instruction.constraints.push_back(clobber);
+		}
+	}
+
+	void UASM::emit0(const char* inst)
+	{
+		compiled_instruction_t i{};
+		i.asm_ = inst;
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit1(const char* inst, const Arg& arg0, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) }
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit2(const char* inst, const Arg& arg0, const Arg& arg1, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit3(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+			{ "{2}", arg2.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1, arg2 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::emit4(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const Arg& arg3, const std::vector<gpr>& clobbered)
+	{
+		int arg_id = 0;
+		fmt_replacement_list_t repl = {
+			{ "{0}", arg0.to_string(&arg_id) },
+			{ "{1}", arg1.to_string(&arg_id) },
+			{ "{2}", arg2.to_string(&arg_id) },
+			{ "{3}", arg3.to_string(&arg_id) },
+		};
+
+		compiled_instruction_t i{};
+		i.asm_ = fmt::replace_all(inst, repl);
+		embed_args(i, { arg0, arg1, arg2, arg3 }, clobbered);
+		m_instructions.push_back(i);
+	}
+
+	void UASM::insert(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx) const
+	{
+		for (const auto& inst : m_instructions)
+		{
+			auto constraints = fmt::merge(inst.constraints, ",");
+			llvm_asm(irb, inst.asm_, inst.args, constraints, ctx);
+		}
+	}
+
+	void UASM::append(const UASM& that)
+	{
+		m_instructions.reserve(m_instructions.size() + that.m_instructions.size());
+		std::copy(that.m_instructions.begin(), that.m_instructions.end(), std::back_inserter(this->m_instructions));
+	}
+
+	void UASM::prepend(const UASM& that)
+	{
+		auto new_instructions = that.m_instructions;
+		new_instructions.reserve(m_instructions.size() + that.m_instructions.size());
+		std::copy(m_instructions.begin(), m_instructions.end(), std::back_inserter(new_instructions));
+		m_instructions = std::move(new_instructions);
+	}
+
+	// Convenience arg wrappers
+	UASM::Arg UASM::Int(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMInt,
+			.value = v
+		};
+	}
+
+	UASM::Arg UASM::Imm(s64 v)
+	{
+		return Arg {
+			.type = ArgType::Immediate,
+			.imm = v
+		};
+	}
+
+	UASM::Arg UASM::Reg(gpr reg)
+	{
+		return Arg {
+			.type = ArgType::Register,
+			.reg = reg
+		};
+	}
+
+	UASM::Arg UASM::Reg(spr reg)
+	{
+		return Arg {
+			.type = ArgType::SRegister,
+			.sreg = reg
+		};
+	}
+
+	UASM::Arg UASM::Ptr(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMPtr,
+			.value = v
+		};
+	}
+
+	UASM::Arg UASM::Var(llvm::Value* v)
+	{
+		return Arg {
+			.type = ArgType::LLVMReg,
+			.value = v
+		};
+	}
+
+	void UASM::mov(gpr dst, gpr src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), Reg(src), { dst });
+	}
+
+	void UASM::mov(gpr dst, const Arg& src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), src, { dst });
+	}
+
+	void UASM::movnt(gpr dst, const Arg& src)
+	{
+		emit2("mov {0}, {1}", Reg(dst), src, {});
+	}
+
+	void UASM::str(gpr src, gpr base, const Arg& offset)
+	{
+		emit3("str {0}, [{1}, {2}]", Reg(src), Reg(base), offset, {});
+	}
+
+	void UASM::str(const Arg& src, spr base, const Arg& offset)
+	{
+		emit3("str {0}, [{1}, {2}]", src, Reg(base), offset, {});
+	}
+
+	void UASM::ldr(gpr dst, gpr base, const Arg& offset)
+	{
+		emit3("ldr {0}, [{1}, {2}]", Reg(dst), Reg(base), offset, { dst });
+	}
+
+	void UASM::ldr(gpr dst, spr base, const Arg& offset)
+	{
+		emit3("ldr {0}, [{1}, {2}]", Reg(dst), Reg(base), offset, { dst });
+	}
+
+	void UASM::stp(gpr src0, gpr src1, gpr base, const Arg& offset)
+	{
+		emit4("stp {0}, {1}, [{2}, {3}]", Reg(src0), Reg(src1), Reg(base), offset, {});
+	}
+
+	void UASM::ldp(gpr dst0, gpr dst1, gpr base, const Arg& offset)
+	{
+		emit4("ldp {0}, {1}, [{2}, {3}]", Reg(dst0), Reg(dst1), Reg(base), offset, { dst0, dst1 });
+	}
+
+	void UASM::b(const Arg& target)
+	{
+		emit1("b {0}", target, {});
+	}
+
+	void UASM::br(gpr target)
+	{
+		emit1("br {0}", Reg(target), {});
+	}
+
+	void UASM::br(const Arg& target)
+	{
+		emit1("br {0}", target, {});
+	}
+
+	void UASM::ret()
+	{
+		emit0("ret");
+	}
+
+	void UASM::adr(gpr dst, const Arg& src)
+	{
+		emit2("adr {0}, {1}", Reg(dst), src, { dst });
+	}
+
+	void UASM::add(spr dst, spr src0, const Arg& src1)
+	{
+		emit3("add {0}, {1}, {2}", Reg(dst), Reg(src0), src1, {});
+	}
+
+	void UASM::sub(spr dst, spr src0, const Arg& src1)
+	{
+		emit3("sub {0}, {1}, {2}", Reg(dst), Reg(src0), src1, {});
+	}
+
+	void UASM::nop(const std::vector<Arg>& refs)
+	{
+		emit0("nop");
+		embed_args(m_instructions.back(), refs, {});
+	}
+
+	void UASM::brk(int mark)
+	{
+		emit1("brk {0}", Imm(mark), {});
+	}
+
+	std::string UASM::Arg::to_string(int* id) const
+	{
+		// Safety check around the optional arg incrementer
+		int dummy = 0;
+		if (!id)
+		{
+			id = &dummy;
+		}
+
+		switch (type)
+		{
+		case ArgType::Immediate:
+			return std::string("#") + std::to_string(imm);
+		case ArgType::Register:
+			return gpr_names[static_cast<int>(reg)];
+		case ArgType::SRegister:
+			return spr_asm_names[static_cast<int>(sreg)];
+		case ArgType::LLVMInt:
+		case ArgType::LLVMReg:
+		case ArgType::LLVMPtr:
+		default:
+			// Return placeholder identifier
+			return std::string("$") + std::to_string((*id)++);
+		}
+	}
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
new file mode 100644
index 0000000000..f494fafc63
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64ASM.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "AArch64Common.h"
+
+namespace aarch64
+{
+	// Micro-assembler
+	class UASM
+	{
+	public:
+		enum ArgType
+		{
+			Register = 0,
+			SRegister,
+			Immediate,
+			LLVMInt,
+			LLVMPtr,
+			LLVMReg
+		};
+
+		struct Arg
+		{
+			ArgType type;
+			union
+			{
+				llvm::Value* value;
+				gpr reg;
+				spr sreg;
+				s64 imm;
+			};
+
+			std::string to_string(int* id = nullptr) const;
+		};
+
+	protected:
+		struct compiled_instruction_t
+		{
+			std::string asm_;
+			std::vector<std::string> constraints;
+			std::vector<llvm::Value*> args;
+		};
+
+		std::vector<compiled_instruction_t> m_instructions;
+
+		void emit0(const char* inst);
+		void emit1(const char* inst, const Arg& arg0, const std::vector<gpr>& clobbered);
+		void emit2(const char* inst, const Arg& arg0, const Arg& arg1, const std::vector<gpr>& clobbered);
+		void emit3(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const std::vector<gpr>& clobbered);
+		void emit4(const char* inst, const Arg& arg0, const Arg& arg1, const Arg& arg2, const Arg& arg3, const std::vector<gpr>& clobbered);
+
+		void embed_args(compiled_instruction_t& instruction, const std::vector<Arg>& args, const std::vector<gpr>& clobbered);
+
+	public:
+		UASM() = default;
+
+		// Convenience wrappers
+		static Arg Int(llvm::Value* v);
+		static Arg Imm(s64 v);
+		static Arg Reg(gpr reg);
+		static Arg Reg(spr reg);
+		static Arg Ptr(llvm::Value* v);
+		static Arg Var(llvm::Value* v);
+
+		void mov(gpr dst, gpr src);
+		void mov(gpr dst, const Arg& src);
+		void movnt(gpr dst, const Arg& src);
+
+		void adr(gpr dst, const Arg& src);
+
+		void str(gpr src, gpr base, const Arg& offset);
+		void str(gpr src, spr base, const Arg& offset);
+		void str(const Arg& src, gpr base, const Arg& offset);
+		void str(const Arg& src, spr base, const Arg& offset);
+		void ldr(gpr dst, gpr base, const Arg& offset);
+		void ldr(gpr dst, spr base, const Arg& offset);
+
+		void stp(gpr src0, gpr src1, gpr base, const Arg& offset);
+		void stp(gpr src0, gpr src1, spr base, const Arg& offset);
+		void ldp(gpr dst0, gpr dst1, gpr base, const Arg& offset);
+		void ldp(gpr dst0, gpr dst1, spr base, const Arg& offset);
+
+		void add(spr dst, spr src0, const Arg& src1);
+		void add(gpr dst, gpr src0, const Arg& src1);
+		void sub(spr dst, spr src0, const Arg& src1);
+		void sub(gpr dst, gpr src0, const Arg& src1);
+
+		void b(const Arg& target);
+		void br(gpr target);
+		void br(const Arg& target);
+		void ret();
+
+		void nop(const std::vector<Arg>& refs = {});
+		void brk(int mark = 0);
+
+		void append(const UASM& other);
+		void prepend(const UASM& other);
+
+		void insert(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx) const;
+	};
+
+	using ASMBlock = UASM;
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
new file mode 100644
index 0000000000..9b77e5eb36
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64Common.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include "../../CPUTranslator.h"
+
+namespace aarch64
+{
+	enum gpr : s32
+	{
+		x0 = 0,
+		x1, x2, x3, x4, x5, x6, x7, x8, x9,
+		x10, x11, x12, x13, x14, x15, x16, x17, x18, x19,
+		x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30
+	};
+
+	enum spr : s32
+	{
+		xzr = 0,
+		pc,
+		sp
+	};
+
+	static const char* gpr_names[] =
+	{
+		"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
+		"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+		"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", "x30"
+	};
+
+	static const char* spr_names[] =
+	{
+		"xzr", "pc", "sp"
+	};
+
+	static const char* spr_asm_names[] =
+	{
+		"xzr", ".", "sp"
+	};
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
new file mode 100644
index 0000000000..6235aa4ea4
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.cpp
@@ -0,0 +1,529 @@
+#include "stdafx.h"
+
+
+#include "AArch64JIT.h"
+#include "AArch64ASM.h"
+
+LOG_CHANNEL(jit_log, "JIT");
+
+#define STDOUT_DEBUG 0
+
+#define DPRINT1(...)\
+	do {\
+		printf(__VA_ARGS__);\
+		printf("\n");\
+		fflush(stdout);\
+	} while (0)
+
+#if STDOUT_DEBUG
+#define DPRINT DPRINT1
+#else
+#define DPRINT jit_log.trace
+#endif
+
+namespace aarch64
+{
+	using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t;
+	using function_info_t = GHC_frame_preservation_pass::function_info_t;
+
+	GHC_frame_preservation_pass::GHC_frame_preservation_pass(const config_t& configuration)
+		: m_config(configuration)
+	{}
+
+	void GHC_frame_preservation_pass::reset()
+	{
+		m_visited_functions.clear();
+	}
+
+	void GHC_frame_preservation_pass::force_tail_call_terminators(llvm::Function& f)
+	{
+		// GHC functions are not call-stack preserving and can therefore never return if they make any external calls at all.
+		// Replace every terminator clause with a tail call explicitly. This is already done for X64, but better safe than sorry.
+		for (auto& bb : f)
+		{
+			auto bit = bb.begin(), prev = bb.end();
+			for (; bit != bb.end(); prev = bit, ++bit)
+			{
+				if (prev == bb.end())
+				{
+					continue;
+				}
+
+				if (llvm::isa<llvm::ReturnInst>(&*bit))
+				{
+					if (auto ci = llvm::dyn_cast<llvm::CallInst>(&*prev))
+					{
+						// This is a "ret" that is coming after a "call" to another function.
+						// Enforce that it must be a tail call.
+						if (!ci->isTailCall())
+						{
+							ci->setTailCall();
+						}
+					}
+				}
+			}
+		}
+	}
+
+	function_info_t GHC_frame_preservation_pass::preprocess_function(const llvm::Function& f)
+	{
+		function_info_t result{};
+		result.instruction_count = f.getInstructionCount();
+
+		// Blanket exclusions. Stubs or dispatchers that do not compute anything themselves.
+		if (f.getName() == "__spu-null")
+		{
+			// Don't waste the effort processing this stub. It has no points of concern
+			result.num_external_calls = 1;
+			return result;
+		}
+
+		if (m_config.use_stack_frames)
+		{
+			// Stack frame estimation. SPU code can be very long and consumes several KB of stack.
+			u32 stack_frame_size = 128u;
+			// Actual ratio is usually around 1:4
+			const u32 expected_compiled_instr_count = f.getInstructionCount() * 4;
+			// Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs.
+			// We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions.
+			// FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead.
+			// A proper fix is to port this to a MF pass, but I have PTSD from working at MF level.
+			const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u;
+			stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes
+
+			result.stack_frame_size = stack_frame_size;
+		}
+
+		result.instruction_count = f.getInstructionCount();
+		result.num_external_calls = 0;
+
+		// The LR is not spared by LLVM in cases where there is a lot of spilling.
+		// This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR.
+		// This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit.
+		// For now, assume it is dirtied if the function is of any reasonable length.
+		result.clobbers_x30 = result.instruction_count > 32;
+		result.is_leaf = true;
+
+		for (auto& bb : f)
+		{
+			for (auto& inst : bb)
+			{
+				if (auto ci = llvm::dyn_cast<llvm::CallInst>(&inst))
+				{
+					if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
+					{
+						// Inline ASM blocks are ignored
+						continue;
+					}
+
+					result.num_external_calls++;
+					if (ci->isTailCall())
+					{
+						// This is not a leaf if it has at least one exit point / terminator that is not a return instruction.
+						result.is_leaf = false;
+					}
+					else
+					{
+						// Returning calls always clobber x30
+						result.clobbers_x30 = true;
+					}
+				}
+			}
+		}
+
+		return result;
+	}
+
+	instruction_info_t GHC_frame_preservation_pass::decode_instruction(const llvm::Function& f, const llvm::Instruction* i)
+	{
+		instruction_info_t result{};
+		if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+		{
+			// Watch out for injected ASM blocks...
+			if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
+			{
+				// Not a real call. This is just an insert of inline asm
+				return result;
+			}
+
+			result.is_call_inst = true;
+			result.is_returning = true;
+			result.preserve_stack = !ci->isTailCall();
+			result.callee = ci->getCalledFunction();
+			result.is_tail_call = ci->isTailCall();
+
+			if (!result.callee)
+			{
+				// Indirect call (call from raw value).
+				result.is_indirect = true;
+				result.callee_is_GHC = ci->getCallingConv() == llvm::CallingConv::GHC;
+				result.callee_name = "__indirect_call";
+			}
+			else
+			{
+				result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
+				result.callee_name = result.callee->getName().str();
+			}
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::BranchInst>(i))
+		{
+			// More likely to jump out via an unconditional...
+			if (!bi->isConditional())
+			{
+				ensure(bi->getNumSuccessors() == 1);
+				auto targetbb = bi->getSuccessor(0);
+
+				result.callee = targetbb->getParent();
+				result.callee_name = result.callee->getName().str();
+				result.is_call_inst = result.callee_name != f.getName();
+			}
+
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::IndirectBrInst>(i))
+		{
+			// Very unlikely to be the same function. Can be considered a function exit.
+			ensure(bi->getNumDestinations() == 1);
+			auto targetbb = ensure(bi->getSuccessor(0)); // This is guaranteed to fail but I've yet to encounter this
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::CallBrInst>(i))
+		{
+			ensure(bi->getNumSuccessors() == 1);
+			auto targetbb = bi->getSuccessor(0);
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		if (auto bi = llvm::dyn_cast<llvm::InvokeInst>(i))
+		{
+			ensure(bi->getNumSuccessors() == 2);
+			auto targetbb = bi->getSuccessor(0);
+
+			result.callee = targetbb->getParent();
+			result.callee_name = result.callee->getName().str();
+			result.is_call_inst = result.callee_name != f.getName();
+			return result;
+		}
+
+		return result;
+	}
+
+	gpr GHC_frame_preservation_pass::get_base_register_for_call(const std::string& callee_name, gpr default_reg)
+	{
+		// We go over the base_register_lookup table and find the first matching pattern
+		for (const auto& pattern : m_config.base_register_lookup)
+		{
+			if (callee_name.starts_with(pattern.first))
+			{
+				return pattern.second;
+			}
+		}
+
+		return default_reg;
+	}
+
+	void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
+	{
+		if (f.getCallingConv() != llvm::CallingConv::GHC)
+		{
+			// If we're not doing GHC, the calling conv will have stack fixup on its own via prologue/epilogue
+			return;
+		}
+
+		if (f.getInstructionCount() == 0)
+		{
+			// Nothing to do. Happens with placeholder functions such as branch patchpoints
+			return;
+		}
+
+		const auto this_name = f.getName().str();
+		if (m_visited_functions.find(this_name) != m_visited_functions.end())
+		{
+			// Already processed. Only useful when recursing, which is currently not used.
+			DPRINT("Function %s was already processed. Skipping.\n", this_name.c_str());
+			return;
+		}
+
+		if (this_name != "__spu-null") // This name is meaningless and doesn't uniquely identify a function
+		{
+			m_visited_functions.insert(this_name);
+		}
+
+		if (m_config.exclusion_callback && m_config.exclusion_callback(this_name))
+		{
+			// Function is explicitly excluded
+			return;
+		}
+
+		// Preprocessing.
+		auto function_info = preprocess_function(f);
+		if (function_info.num_external_calls == 0 && function_info.stack_frame_size == 0)
+		{
+			// No stack frame injection and no external calls to patch up. This is a leaf function, nothing to do.
+			DPRINT("Ignoring function %s", this_name.c_str());
+			return;
+		}
+
+		// Force tail calls on all terminators
+		force_tail_call_terminators(f);
+
+		// Check for leaves
+		if (function_info.is_leaf && !m_config.use_stack_frames)
+		{
+			// Sanity check. If this function had no returning calls, it should have been omitted from processing.
+			ensure(function_info.clobbers_x30, "Function has no terminator and no non-tail calls but was allowed for frame processing!");
+			DPRINT("Function %s is a leaf.", this_name.c_str());
+			process_leaf_function(irb, f);
+			return;
+		}
+
+		// Asm snippets for patching stack frame
+		ASMBlock frame_prologue, frame_epilogue;
+
+		if (function_info.stack_frame_size > 0)
+		{
+			// NOTE: The stack frame here is purely optional, we can pre-allocate scratch on the gateway.
+			// However, that is an optimization for another time, this helps make debugging easier.
+			frame_prologue.sub(sp, sp, UASM::Imm(function_info.stack_frame_size));
+			frame_epilogue.add(sp, sp, UASM::Imm(function_info.stack_frame_size));
+
+			// Emit the frame prologue. We use a BB here for extra safety as it solves the problem of backwards jumps re-executing the prologue.
+			auto functionStart = &f.front();
+			auto prologueBB = llvm::BasicBlock::Create(f.getContext(), "", &f, functionStart);
+			irb->SetInsertPoint(prologueBB, prologueBB->begin());
+			frame_prologue.insert(irb, f.getContext());
+			irb->CreateBr(functionStart);
+		}
+
+		// Now we start processing
+		bool terminator_found = false;
+		for (auto& bb : f)
+		{
+			for (auto bit = bb.begin(); bit != bb.end();)
+			{
+				const auto instruction_info = decode_instruction(f, &(*bit));
+				if (!instruction_info.is_call_inst)
+				{
+					++bit;
+					continue;
+				}
+
+				std::string callee_name = "__unknown";
+				if (const auto cf = instruction_info.callee)
+				{
+					callee_name = cf->getName().str();
+					if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
+					{
+						// Always inlined call. Likely inline Asm. Skip
+						++bit;
+						continue;
+					}
+
+					// Technically, we should also ignore any host functions linked in, usually starting with a ppu_ or spu_ prefix.
+					// However, there is little guarantee that those are safe, with only rare exceptions, and patching the frame around them does not hurt much anyway.
+				}
+
+				if (instruction_info.preserve_stack)
+				{
+					// Non-tail call. If we have a stack allocated, we preserve it across the call
+					++bit;
+					continue;
+				}
+
+				ensure(instruction_info.is_tail_call);
+				terminator_found = true;
+
+				// Now we patch the call if required. For normal calls that 'return' (i.e. calls to the C/C++ ABI), we do not patch them as they will manage the stack themselves (callee-managed).
+				bit = patch_tail_call(irb, f, bit, instruction_info, function_info, frame_epilogue);
+
+				// Next
+				if (bit != bb.end())
+				{
+					++bit;
+				}
+			}
+		}
+
+		if (!terminator_found)
+		{
+			// If we got here, we must be using stack frames.
+			ensure(function_info.is_leaf && function_info.stack_frame_size > 0, "Leaf function was processed without using stack frames!");
+
+			// We want to insert a frame cleanup at the tail at every return instruction we find.
+			for (auto& bb : f)
+			{
+				for (auto& i : bb)
+				{
+					if (is_ret_instruction(&i))
+					{
+						irb->SetInsertPoint(&i);
+						frame_epilogue.insert(irb, f.getContext());
+					}
+				}
+			}
+		}
+	}
+
+	llvm::BasicBlock::iterator
+	GHC_frame_preservation_pass::patch_tail_call(
+		llvm::IRBuilder<>* irb,
+		llvm::Function& f,
+		llvm::BasicBlock::iterator where,
+		const instruction_info_t& instruction_info,
+		const function_info_t& function_info,
+		const UASM& frame_epilogue)
+	{
+		auto ci = llvm::dyn_cast<llvm::CallInst>(&*where);
+		irb->SetInsertPoint(ensure(ci));
+
+		const auto this_name = f.getName().str();
+
+		// Insert breadcrumb info before the call
+		if (m_config.debug_info)
+		{
+			// Call-chain tracing
+			ASMBlock c;
+			c.mov(x29, x28);
+			c.mov(x28, x27);
+			c.adr(x27, UASM::Reg(pc));
+			c.insert(irb, f.getContext());
+		}
+
+		// Clean up any injected frames before the call
+		if (function_info.stack_frame_size > 0)
+		{
+			frame_epilogue.insert(irb, f.getContext());
+		}
+
+		// Insert the next piece after the call, before the ret
+		++where;
+		ensure(llvm::isa<llvm::ReturnInst>(&*where));
+		irb->SetInsertPoint(llvm::dyn_cast<llvm::Instruction>(&*where));
+
+		if (instruction_info.callee_is_GHC &&        // Calls to the C++ ABI will always return
+			!instruction_info.is_indirect &&         // We don't know enough when calling indirectly to know if we'll return or not
+			instruction_info.callee_name.find("-pp-") == umax) // Skip branch patch-points as those are just indirect calls. TODO: Move this to instruction decode.
+		{
+			// We're making a one-way call. This branch shouldn't even bother linking as it will never return here.
+			ASMBlock c;
+			c.brk(0x99);
+			c.insert(irb, f.getContext());
+			return where;
+		}
+
+		// Patch the return path. No GHC call shall ever return to another. If we reach the function endpoint, immediately abort back to the gateway.
+		auto thread_base_reg = get_base_register_for_call(f.getName().str());
+		auto arg_index = static_cast<int>(thread_base_reg) - static_cast<int>(x19);
+		ASMBlock c;
+
+		auto thread_arg = ensure(f.getArg(arg_index)); // Guaranteed to hold our original 'thread'
+		c.mov(x30, UASM::Var(thread_arg));
+		c.ldr(x30, x30, UASM::Imm(m_config.hypervisor_context_offset));
+		c.insert(irb, f.getContext());
+
+		// Next
+		return where;
+	}
+
+	bool GHC_frame_preservation_pass::is_ret_instruction(const llvm::Instruction* i)
+	{
+		if (llvm::isa<llvm::ReturnInst>(i))
+		{
+			return true;
+		}
+
+		// Check for inline asm invoking "ret". This really shouldn't be a thing, but it is present in SPULLVMRecompiler for some reason.
+		if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
+		{
+			if (auto asm_ = llvm::dyn_cast<llvm::InlineAsm>(ci->getCalledOperand()))
+			{
+				if (asm_->getAsmString() == "ret")
+				{
+					return true;
+				}
+			}
+		}
+
+		return false;
+	}
+
+	bool GHC_frame_preservation_pass::is_inlined_call(const llvm::CallInst* ci)
+	{
+		const auto callee = ci->getCalledFunction();
+		if (!callee)
+		{
+			// Indirect BLR
+			return false;
+		}
+
+		const std::string callee_name = callee->getName().str();
+		if (callee_name.starts_with("llvm."))
+		{
+			// Intrinsic
+			return true;
+		}
+
+		if (callee->hasFnAttribute(llvm::Attribute::AlwaysInline))
+		{
+			// Assume LLVM always obeys this
+			return true;
+		}
+
+		return false;
+	}
+
+	void GHC_frame_preservation_pass::process_leaf_function(llvm::IRBuilder<>* irb, llvm::Function& f)
+	{
+		for (auto& bb : f)
+		{
+			for (auto bit = bb.begin(); bit != bb.end();)
+			{
+				auto i = llvm::dyn_cast<llvm::Instruction>(&*bit);
+				if (!is_ret_instruction(i))
+				{
+					++bit;
+					continue;
+				}
+
+				// Insert sequence before the return
+				irb->SetInsertPoint(llvm::dyn_cast<llvm::Instruction>(&*bit));
+
+				if (m_config.debug_info)
+				{
+					// We need to save the chain return point.
+					ASMBlock c;
+					c.mov(x29, x28);
+					c.mov(x28, x27);
+					c.adr(x27, UASM::Reg(pc));
+					c.insert(irb, f.getContext());
+				}
+
+				// Now we need to reload LR. We abuse the function's caller arg set for this to avoid messing with regs too much
+				auto thread_base_reg = get_base_register_for_call(f.getName().str());
+				auto arg_index = static_cast<int>(thread_base_reg) - static_cast<int>(x19);
+				ASMBlock c;
+
+				auto thread_arg = ensure(f.getArg(arg_index)); // Guaranteed to hold our original 'thread'
+				c.mov(x30, UASM::Var(thread_arg));
+				c.ldr(x30, x30, UASM::Imm(m_config.hypervisor_context_offset));
+				c.insert(irb, f.getContext());
+
+				if (bit != bb.end())
+				{
+					++bit;
+				}
+			}
+		}
+	}
+}
diff --git a/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h
new file mode 100644
index 0000000000..ed2a2b08e5
--- /dev/null
+++ b/rpcs3/Emu/CPU/Backends/AArch64/AArch64JIT.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#ifndef ARCH_ARM64
+#error "You have included an arm-only header"
+#endif
+
+#include "AArch64Common.h"
+
+#include <unordered_set>
+
+namespace aarch64
+{
+	class UASM;
+	using ASMBlock = UASM;
+
+	// On non-x86 architectures GHC runs stackless. SP is treated as a pointer to scratchpad memory.
+	// This pass keeps this behavior intact while preserving the expectations of the host's C++ ABI.
+	class GHC_frame_preservation_pass : public translator_pass
+	{
+	public:
+		struct function_info_t
+		{
+			u32 instruction_count;
+			u32 num_external_calls;
+			u32 stack_frame_size; // Guessing this properly is critical for vector-heavy functions where spilling is a lot more common
+			bool clobbers_x30;
+			bool is_leaf;
+		};
+
+		struct instruction_info_t
+		{
+			bool is_call_inst;       // Is a function call. This includes a branch to external code.
+			bool preserve_stack;     // Preserve the stack around this call.
+			bool is_returning;       // This instruction "returns" to the next instruction (typically just llvm::CallInst*)
+			bool callee_is_GHC;      // The other function is GHC
+			bool is_tail_call;       // Tail call. Assume it is an exit/terminator.
+			bool is_indirect;        // Indirect call. Target is the first operand.
+			llvm::Function* callee;  // Callee if any
+			std::string callee_name; // Name of the callee.
+		};
+
+		struct config_t
+		{
+			bool debug_info = false;           // Record debug information
+			bool use_stack_frames = true;      // Allocate a stack frame for each function. The gateway can alternatively manage a global stack to use as scratch.
+			bool optimize = true;              // Optimize instructions when possible. Set to false when debugging.
+			u32 hypervisor_context_offset = 0; // Offset within the "thread" object where we can find the hypervisor context (registers configured at gateway).
+			std::function<bool(const std::string&)> exclusion_callback;    // [Optional] Callback run on each function before transform. Return "true" to exclude from frame processing.
+			std::vector<std::pair<std::string, gpr>> base_register_lookup; // [Optional] Function lookup table to determine the location of the "thread" context.
+		};
+
+	protected:
+		std::unordered_set<std::string> m_visited_functions;
+
+		config_t m_config;
+
+		void force_tail_call_terminators(llvm::Function& f);
+
+		function_info_t preprocess_function(const llvm::Function& f);
+
+		instruction_info_t decode_instruction(const llvm::Function& f, const llvm::Instruction* i);
+
+		bool is_ret_instruction(const llvm::Instruction* i);
+
+		bool is_inlined_call(const llvm::CallInst* ci);
+
+		gpr get_base_register_for_call(const std::string& callee_name, gpr default_reg = gpr::x19);
+
+		void process_leaf_function(llvm::IRBuilder<>* irb, llvm::Function& f);
+
+		llvm::BasicBlock::iterator patch_tail_call(
+			llvm::IRBuilder<>* irb,
+			llvm::Function& f,
+			llvm::BasicBlock::iterator where,
+			const instruction_info_t& instruction_info,
+			const function_info_t& function_info,
+			const ASMBlock& frame_epilogue);
+	public:
+
+		GHC_frame_preservation_pass(const config_t& configuration);
+		~GHC_frame_preservation_pass() = default;
+
+		void run(llvm::IRBuilder<>* irb, llvm::Function& f) override;
+		void reset() override;
+	};
+}
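
Usage sketch (not part of this patch): a minimal, hypothetical driver showing how a backend might configure and run GHC_frame_preservation_pass over a module after IR generation. It assumes an ARCH_ARM64 build and rpcs3-style include paths; the function name run_ghc_frame_pass, the "spu" base-register pattern, and the zero hypervisor_context_offset are illustrative placeholders, since the real call site, offsets, and exclusion rules live in the PPU/SPU emitters.

// Hypothetical driver for the pass; names and values below are assumptions, not taken from this patch.
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Module.h>

#include "Emu/CPU/Backends/AArch64/AArch64JIT.h"

static void run_ghc_frame_pass(llvm::Module& module)
{
	aarch64::GHC_frame_preservation_pass::config_t config{};
	config.debug_info = false;            // Enable to emit the x27/x28/x29 call-chain breadcrumbs
	config.use_stack_frames = true;       // Give each GHC function its own scratch frame
	config.hypervisor_context_offset = 0; // Placeholder: offset of the gateway (hypervisor) context inside the thread object
	config.base_register_lookup = { { "spu", aarch64::x19 } }; // Placeholder: assume SPU functions carry the thread pointer in x19

	aarch64::GHC_frame_preservation_pass pass(config);
	llvm::IRBuilder<> irb(module.getContext());

	// The pass itself skips non-GHC functions, empty placeholders, and anything the exclusion callback rejects.
	for (auto& f : module)
	{
		if (!f.isDeclaration())
		{
			pass.run(&irb, f);
		}
	}

	pass.reset(); // Clear the visited-function cache before processing the next module
}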