/*
	Copyright 2020 flyinghead

	This file is part of flycast.

	flycast is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 2 of the License, or
	(at your option) any later version.

	flycast is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with flycast.  If not, see <https://www.gnu.org/licenses/>.
*/
#include "build.h"

#if HOST_CPU == CPU_ARM64 && FEAT_AREC != DYNAREC_NONE

#include <vector>
#include "arm7_rec.h"
#include "hw/mem/_vmem.h"
#include <aarch64/macro-assembler-aarch64.h>
using namespace vixl::aarch64;
//#include <aarch64/disasm-aarch64.h>
#include "rec-ARM64/arm64_unwind.h"

namespace aicaarm {

static void (*arm_dispatch)();		// Not an executable address

class Arm7Compiler;
#define MAX_REGS 8

static Arm64UnwindInfo unwinder;

class AArch64ArmRegAlloc : public ArmRegAlloc<MAX_REGS, AArch64ArmRegAlloc>
{
	Arm7Compiler& assembler;

	void LoadReg(int host_reg, Arm7Reg armreg);
	void StoreReg(int host_reg, Arm7Reg armreg);

	static const WRegister& getReg(int i)
	{
		static const WRegister regs[] = {
			w19, w20, w21, w22, w23, w24, w25, w27
		};
		static_assert(MAX_REGS == ARRAY_SIZE(regs), "MAX_REGS == ARRAY_SIZE(regs)");
		verify(i >= 0 && (u32)i < ARRAY_SIZE(regs));
		return regs[i];
	}

public:
	AArch64ArmRegAlloc(Arm7Compiler& assembler, const std::vector<ArmOp>& block_ops)
		: ArmRegAlloc(block_ops), assembler(assembler) {}

	const WRegister& map(Arm7Reg r)
	{
		int i = ArmRegAlloc::map(r);
		return getReg(i);
	}

	friend class ArmRegAlloc<MAX_REGS, AArch64ArmRegAlloc>;
};

static MemOperand arm_reg_operand(Arm7Reg reg)
{
	return MemOperand(x28, (u8*)&arm_Reg[reg].I - (u8*)&arm_Reg[0].I);
}

class Arm7Compiler : public MacroAssembler
{
	bool logical_op_set_flags = false;
	bool set_carry_bit = false;
	bool set_flags = false;
	AArch64ArmRegAlloc *regalloc = nullptr;

	static const u32 N_FLAG = 1 << 31;
	static const u32 Z_FLAG = 1 << 30;
	static const u32 C_FLAG = 1 << 29;
	static const u32 V_FLAG = 1 << 28;

	Label *startConditional(ArmOp::Condition cc)
	{
		if (cc == ArmOp::AL)
			return nullptr;
		Label *label = new Label();
		verify(cc <= ArmOp::LE);
		// Branch to the end label when the inverse condition holds, skipping the op
		Condition condition = (Condition)((u32)cc ^ 1);
		B(label, condition);

		return label;
	}

	void endConditional(Label *label)
	{
		if (label != nullptr)
		{
			Bind(label);
			delete label;
		}
	}

	void call(void *loc)
	{
		ptrdiff_t offset = reinterpret_cast<uintptr_t>(loc)
				- reinterpret_cast<uintptr_t>(recompiler::writeToExec(GetBuffer()->GetStartAddress<void *>()));
		Label function_label;
		BindToOffset(&function_label, offset);
		Bl(&function_label);
	}
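
	// The ARM barrel shifter has edge cases with no single arm64 equivalent:
	// register-specified LSL/LSR saturate to 0 for amounts >= 32, ASR replicates
	// the sign bit instead, an immediate shift amount of 0 encodes a shift by 32
	// (or RRX for ROR), and flag-setting logical ops need the shifter carry-out.
	// getOperand() materializes each of these cases, keeping the carry-out in w14
	// whenever it must later be merged into NZCV (set_carry_bit).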
	Operand getOperand(const ArmOp::Operand& arg, const Register& scratch_reg)
	{
		Register rm;
		if (arg.isNone())
			return Operand();
		else if (arg.isImmediate())
		{
			if (!arg.isShifted())
				return Operand(arg.getImmediate());
			Mov(scratch_reg, arg.getImmediate());
			rm = scratch_reg;
		}
		else if (arg.isReg())
		{
			rm = regalloc->map(arg.getReg().armreg);
		}

		Operand rv;
		if (!arg.shift_imm)
		{
			// Shift by register
			const Register& shift_reg = regalloc->map(arg.shift_reg.armreg);
			switch (arg.shift_type)
			{
			case ArmOp::LSL:
			case ArmOp::LSR:
				verify(!scratch_reg.Is(w0));
				Mrs(x0, NZCV);
				Cmp(shift_reg, 32);
				if (arg.shift_type == ArmOp::LSL)
					Lsl(scratch_reg, rm, shift_reg);
				else
					Lsr(scratch_reg, rm, shift_reg);
				Csel(scratch_reg, wzr, scratch_reg, ge);	// LSL and LSR by 32 or more gives 0
				Msr(NZCV, x0);
				break;
			case ArmOp::ASR:
				verify(!scratch_reg.Is(w0));
				Mrs(x0, NZCV);
				Cmp(shift_reg, 32);
				Sbfx(w13, rm, 31, 1);
				Asr(scratch_reg, rm, shift_reg);
				Csel(scratch_reg, w13, scratch_reg, ge);	// ASR by 32 or more gives 0 or -1 depending on operand sign
				Msr(NZCV, x0);
				break;
			case ArmOp::ROR:
				Ror(scratch_reg, rm, shift_reg);
				break;
			default:
				die("Invalid shift");
				break;
			}
			rv = Operand(scratch_reg);
		}
		else
		{
			// Shift by immediate
			if (arg.shift_type != ArmOp::ROR && arg.shift_value != 0 && !logical_op_set_flags)
			{
				rv = Operand(rm, (Shift)arg.shift_type, arg.shift_value);
			}
			else if (arg.shift_value == 0)
			{
				if (arg.shift_type == ArmOp::LSL)
				{
					rv = Operand(rm);					// LSL 0 is a no-op
				}
				else
				{
					// Shift by 32
					if (logical_op_set_flags)
						set_carry_bit = true;
					if (arg.shift_type == ArmOp::LSR)
					{
						if (logical_op_set_flags)
							Ubfx(w14, rm, 31, 1);			// w14 = rm[31]
						Mov(scratch_reg, 0);				// scratch = 0
					}
					else if (arg.shift_type == ArmOp::ASR)
					{
						if (logical_op_set_flags)
							Ubfx(w14, rm, 31, 1);			// w14 = rm[31]
						Sbfx(scratch_reg, rm, 31, 1);		// scratch = rm < 0 ? -1 : 0
					}
					else if (arg.shift_type == ArmOp::ROR)
					{
						// RRX
						Cset(w14, cs);						// w14 = C
						if (logical_op_set_flags)
							Mov(w13, rm);					// save rm in case rm = scratch_reg
						Mov(scratch_reg, Operand(rm, LSR, 1));	// scratch = rm >> 1
						Bfi(scratch_reg, w14, 31, 1);		// scratch[31] = C
						if (logical_op_set_flags)
							Ubfx(w14, w13, 0, 1);			// w14 = rm[0] (new C)
					}
					else
						die("Invalid shift");
					rv = Operand(scratch_reg);
				}
			}
			else
			{
				// Carry must be preserved or Ror shift
				if (logical_op_set_flags)
					set_carry_bit = true;
				if (arg.shift_type == ArmOp::LSL)
				{
					Ubfx(w14, rm, 32 - arg.shift_value, 1);		// w14 = rm[lsb]
					Lsl(scratch_reg, rm, arg.shift_value);		// scratch <<= shift
				}
				else
				{
					if (logical_op_set_flags)
						Ubfx(w14, rm, arg.shift_value - 1, 1);	// w14 = rm[msb]
					if (arg.shift_type == ArmOp::LSR)
						Lsr(scratch_reg, rm, arg.shift_value);	// scratch >>= shift
					else if (arg.shift_type == ArmOp::ASR)
						Asr(scratch_reg, rm, arg.shift_value);
					else if (arg.shift_type == ArmOp::ROR)
						Ror(scratch_reg, rm, arg.shift_value);
					else
						die("Invalid shift");
				}
				rv = Operand(scratch_reg);
			}
		}
		return rv;
	}

	const Register getRegister(const Register& scratch_reg, const Operand& op)
	{
		if (op.IsImmediate())
		{
			Mov(scratch_reg, op.GetImmediate());
			return scratch_reg;
		}
		else if (op.IsPlainRegister())
			return op.GetRegister();

		switch (op.GetShift())
		{
		case LSL:
			Lsl(scratch_reg, op.GetRegister(), op.GetShiftAmount());
			break;
		case LSR:
			Lsr(scratch_reg, op.GetRegister(), op.GetShiftAmount());
			break;
		case ASR:
			Asr(scratch_reg, op.GetRegister(), op.GetShiftAmount());
			break;
		case ROR:
			Ror(scratch_reg, op.GetRegister(), op.GetShiftAmount());
			break;
		default:
			die("Invalid shift");
			break;
		}
		return scratch_reg;
	}

	void loadFlags()
	{
		// Load flags
		Ldr(w0, arm_reg_operand(RN_PSR_FLAGS));
		// Move them to the flags register
		Msr(NZCV, x0);
	}

	void storeFlags()
	{
		if (!set_flags)
			return;
		// Get results from the flags register
		Mrs(x1, NZCV);
		// Store flags
		Str(w1, arm_reg_operand(RN_PSR_FLAGS));
	}
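
	// ARM7 guest flags are kept in arm_Reg[RN_PSR_FLAGS] using the same bit layout
	// as the arm64 NZCV system register (N=31, Z=30, C=29, V=28; see the *_FLAG
	// constants above), so loadFlags()/storeFlags() can move them verbatim:
	//
	//   Ldr(w0, arm_reg_operand(RN_PSR_FLAGS));	// guest PSR flags -> w0
	//   Msr(NZCV, x0);								// w0 -> host flags
	//
	// When a shifter carry-out was computed into w14, emitDataProcOp() patches it
	// into bit 29 of NZCV after the ALU operation.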
	void emitDataProcOp(const ArmOp& op)
	{
		Operand arg0 = getOperand(op.arg[0], w1);
		Register rn;
		Operand op2;
		if (op.op_type != ArmOp::MOV && op.op_type != ArmOp::MVN)
		{
			rn = getRegister(w1, arg0);
			if (!rn.Is(w1))
			{
				Mov(w1, rn);
				rn = w1;
			}
			op2 = getOperand(op.arg[1], w2);
		}
		else
			op2 = arg0;
		WRegister rd;
		if (op.rd.isReg())
			rd = regalloc->map(op.rd.getReg().armreg);

		if (logical_op_set_flags)
		{
			// When an Operand2 constant is used with the instructions MOVS, MVNS, ANDS, ORRS, ORNS, EORS, BICS, TEQ or TST,
			// the carry flag is updated to bit[31] of the constant,
			// if the constant is greater than 255 and can be produced by shifting an 8-bit value.
			if (op.arg[0].isImmediate() && op.arg[0].getImmediate() > 255)
			{
				set_carry_bit = true;
				Mov(w14, (op.arg[0].getImmediate() & 0x80000000) >> 31);
			}
			else if (op.arg[1].isImmediate() && op.arg[1].getImmediate() > 255)
			{
				set_carry_bit = true;
				Mov(w14, (op.arg[1].getImmediate() & 0x80000000) >> 31);
			}
			else if (!set_carry_bit)
			{
				// Logical ops should only affect the carry bit based on the op2 shift.
				// Here we're not shifting so the carry bit should be preserved.
				set_carry_bit = true;
				Cset(w14, cs);
			}
		}

		switch (op.op_type)
		{
		case ArmOp::AND:
			if (set_flags)
				Ands(rd, rn, op2);
			else
				And(rd, rn, op2);
			break;
		case ArmOp::EOR:
			Eor(rd, rn, op2);
			if (set_flags)
				Tst(rd, rd);
			break;
		case ArmOp::SUB:
			if (set_flags)
				Subs(rd, rn, op2);
			else
				Sub(rd, rn, op2);
			break;
		case ArmOp::RSB:
			Neg(w0, rn);
			if (set_flags)
				Adds(rd, w0, op2);
			else
				Add(rd, w0, op2);
			break;
		case ArmOp::ADD:
			if (set_flags)
				Adds(rd, rn, op2);
			else
				Add(rd, rn, op2);
			break;
		case ArmOp::ORR:
			Orr(rd, rn, op2);
			if (set_flags)
				Tst(rd, rd);
			break;
		case ArmOp::BIC:
			if (set_flags)
				Bics(rd, rn, op2);
			else
				Bic(rd, rn, op2);
			break;
		case ArmOp::ADC:
			if (set_flags)
				Adcs(rd, rn, op2);
			else
				Adc(rd, rn, op2);
			break;
		case ArmOp::SBC:
			if (set_flags)
				Sbcs(rd, rn, op2);
			else
				Sbc(rd, rn, op2);
			break;
		case ArmOp::RSC:
			Ngc(w0, rn);
			if (set_flags)
				Adds(rd, w0, op2);
			else
				Add(rd, w0, op2);
			break;
		case ArmOp::TST:
#ifdef STRICT_MODE
			// In armv7, TST and TEQ do not affect the V flag.
			// This isn't the case in armv8 so we need to save
			// and restore it.
			// Since this is a bit complicated/slow, let's assume nobody
			// relies on this.
			Cset(w3, vs);
#endif
			Tst(rn, op2);
#ifdef STRICT_MODE
			Mrs(x0, NZCV);
			Bfi(x0, x3, 28, 1);		// V is bit 28
			Msr(NZCV, x0);
#endif
			break;
		case ArmOp::TEQ:
			Eor(w0, rn, op2);
#ifdef STRICT_MODE
			Cset(w3, vs);
#endif
			Tst(w0, w0);
#ifdef STRICT_MODE
			Mrs(x0, NZCV);
			Bfi(x0, x3, 28, 1);		// V is bit 28
			Msr(NZCV, x0);
#endif
			break;
		case ArmOp::CMP:
			Cmp(rn, op2);
			break;
		case ArmOp::CMN:
			Cmn(rn, op2);
			break;
		case ArmOp::MOV:
			Mov(rd, op2);
			if (set_flags)
				Tst(rd, rd);
			break;
		case ArmOp::MVN:
			Mvn(rd, op2);
			if (set_flags)
				Tst(rd, rd);
			break;
		default:
			die("invalid op");
			break;
		}

		if (set_carry_bit)
		{
			Mrs(x0, NZCV);
			Bfi(x0, x14, 29, 1);	// C is bit 29 in NZCV
			Msr(NZCV, x0);
		}
	}
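
	// Loads and stores go through shared helper routines rather than inlined memory
	// accesses: the effective address is computed into w0, the value to store (if
	// any) into w1, then the helper returned by recompiler::getMemOp() is called.
	// Load results come back in w0 and are copied to the mapped destination register.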
	void emitMemOp(const ArmOp& op)
	{
		Operand arg0 = getOperand(op.arg[0], w2);
		Register addr_reg = getRegister(w2, arg0);
		if (!w2.Is(addr_reg))
			Mov(w2, addr_reg);
		if (op.pre_index)
		{
			const ArmOp::Operand& offset = op.arg[1];
			Operand arg1 = getOperand(offset, w1);
			if (!arg1.IsImmediate())
			{
				Register offset_reg = getRegister(w1, arg1);
				if (op.add_offset)
					Add(w2, w2, offset_reg);
				else
					Sub(w2, w2, offset_reg);
			}
			else if (offset.isImmediate() && offset.getImmediate() != 0)
			{
				if (op.add_offset)
					Add(w2, w2, offset.getImmediate());
				else
					Sub(w2, w2, offset.getImmediate());
			}
		}
		Mov(w0, w2);

		if (op.op_type == ArmOp::STR)
		{
			if (op.arg[2].isImmediate())
				Mov(w1, op.arg[2].getImmediate());
			else
				Mov(w1, regalloc->map(op.arg[2].getReg().armreg));
		}

		call(recompiler::getMemOp(op.op_type == ArmOp::LDR, op.byte_xfer));

		if (op.op_type == ArmOp::LDR)
			Mov(regalloc->map(op.rd.getReg().armreg), w0);
	}

	void emitBranch(const ArmOp& op)
	{
		if (op.arg[0].isImmediate())
			Mov(w0, op.arg[0].getImmediate());
		else
		{
			Mov(w0, regalloc->map(op.arg[0].getReg().armreg));
			And(w0, w0, 0xfffffffc);
		}
		Str(w0, arm_reg_operand(R15_ARM_NEXT));
	}

	void emitMRS(const ArmOp& op)
	{
		call((void*)CPUUpdateCPSR);

		if (op.spsr)
			Ldr(regalloc->map(op.rd.getReg().armreg), arm_reg_operand(RN_SPSR));
		else
			Ldr(regalloc->map(op.rd.getReg().armreg), arm_reg_operand(RN_CPSR));
	}

	void emitMSR(const ArmOp& op)
	{
		if (op.arg[0].isImmediate())
			Mov(w0, op.arg[0].getImmediate());
		else
			Mov(w0, regalloc->map(op.arg[0].getReg().armreg));
		if (op.spsr)
			call((void*)recompiler::MSR_do<1>);
		else
			call((void*)recompiler::MSR_do<0>);
	}

	void emitFallback(const ArmOp& op)
	{
		set_flags = false;
		Mov(w0, op.arg[0].getImmediate());
		call((void*)recompiler::interpret);
	}

public:
	Arm7Compiler() : MacroAssembler((u8 *)recompiler::currentCode(), recompiler::spaceLeft()) {}

	void compile(const std::vector<ArmOp>& block_ops, u32 cycles)
	{
		Ldr(w1, arm_reg_operand(CYCL_CNT));
		Sub(w1, w1, cycles);
		Str(w1, arm_reg_operand(CYCL_CNT));

		regalloc = new AArch64ArmRegAlloc(*this, block_ops);

		for (u32 i = 0; i < block_ops.size(); i++)
		{
			const ArmOp& op = block_ops[i];
			DEBUG_LOG(AICA_ARM, "-> %s", op.toString().c_str());

			set_flags = op.flags & ArmOp::OP_SETS_FLAGS;
			logical_op_set_flags = op.isLogicalOp() && set_flags;
			set_carry_bit = false;
			//bool save_v_flag = true;		// FIXME is this needed?

			Label *condLabel = nullptr;

			if (op.flags & (ArmOp::OP_READS_FLAGS | ArmOp::OP_SETS_FLAGS))
				loadFlags();

			if (op.op_type != ArmOp::FALLBACK)
				condLabel = startConditional(op.condition);

			regalloc->load(i);

			if (op.op_type <= ArmOp::MVN)			// data processing op
				emitDataProcOp(op);
			else if (op.op_type <= ArmOp::STR)		// memory load/store
				emitMemOp(op);
			else if (op.op_type <= ArmOp::BL)		// branch
				emitBranch(op);
			else if (op.op_type == ArmOp::MRS)
				emitMRS(op);
			else if (op.op_type == ArmOp::MSR)
				emitMSR(op);
			else if (op.op_type == ArmOp::FALLBACK)
				emitFallback(op);
			else
				die("invalid");

			storeFlags();

			regalloc->store(i);

			endConditional(condLabel);
		}

		ptrdiff_t offset = reinterpret_cast<uintptr_t>(arm_dispatch) - GetBuffer()->GetStartAddress<uintptr_t>();
		Label arm_dispatch_label;
		BindToOffset(&arm_dispatch_label, offset);
		B(&arm_dispatch_label);

		FinalizeCode();
		verify((size_t)GetBuffer()->GetCursorOffset() <= GetBuffer()->GetCapacity());
		vmem_platform_flush_cache(
				recompiler::writeToExec(GetBuffer()->GetStartAddress<void *>()),
				recompiler::writeToExec(GetBuffer()->GetEndAddress<void *>()),
				GetBuffer()->GetStartAddress<void *>(), GetBuffer()->GetEndAddress<void *>());
		recompiler::advance(GetBuffer()->GetSizeInBytes());

#if 0
		Instruction* instr_start = (Instruction *)codestart;
		Instruction* instr_end = GetBuffer()->GetEndAddress<Instruction *>();
		Decoder decoder;
		Disassembler disasm;
		decoder.AppendVisitor(&disasm);
		Instruction* instr;
		for (instr = instr_start; instr < instr_end; instr += kInstructionSize)
		{
			decoder.Decode(instr);
			DEBUG_LOG(AICA_ARM, "arm64 arec\t %p:\t%s", reinterpret_cast<void *>(instr), disasm.GetOutput());
		}
#endif
		delete regalloc;
		regalloc = nullptr;
	}
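
	// Register conventions shared between the main loop below and the compiled blocks:
	//   x28 - base of the arm_Reg register file (see arm_reg_operand())
	//   x26 - base of the block entry-point table, indexed by pc >> 2
	//   w19-w25, w27 - mapped guest registers (AArch64ArmRegAlloc::getReg())
	//   w0-w3, w13, w14 - scratch
	// Each block subtracts its cycle count from CYCL_CNT and branches back to
	// arm_dispatch, which exits once the counter goes negative, services any pending
	// interrupt, and otherwise tail-jumps to the next block through the table.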
	void generateMainLoop()
	{
		if (!recompiler::empty())
		{
			verify(arm_mainloop != nullptr);
			verify(arm_compilecode != nullptr);
			return;
		}
		Label arm_dispatch_label;
		Label arm_dofiq;
		Label arm_exit;

		// For stack unwinding purposes, we pretend that the entire code block is a single function
		unwinder.start(GetCursorAddress<void *>());

		// arm_compilecode:
		arm_compilecode = (void (*)())recompiler::writeToExec(GetCursorAddress<void *>());
		call((void*)recompiler::compile);
		B(&arm_dispatch_label);

		// arm_mainloop(regs, entry points)
		arm_mainloop = (arm_mainloop_t)recompiler::writeToExec(GetCursorAddress<void *>());
		Stp(x25, x26, MemOperand(sp, -96, AddrMode::PreIndex));
		unwinder.allocStack(0, 96);
		unwinder.saveReg(0, x25, 96);
		unwinder.saveReg(0, x26, 88);
		Stp(x27, x28, MemOperand(sp, 16));
		unwinder.saveReg(0, x27, 80);
		unwinder.saveReg(0, x28, 72);
		Stp(x29, x30, MemOperand(sp, 32));
		unwinder.saveReg(0, x29, 64);
		unwinder.saveReg(0, x30, 56);
		Stp(x19, x20, MemOperand(sp, 48));
		unwinder.saveReg(0, x19, 48);
		unwinder.saveReg(0, x20, 40);
		Stp(x21, x22, MemOperand(sp, 64));
		unwinder.saveReg(0, x21, 32);
		unwinder.saveReg(0, x22, 24);
		Stp(x23, x24, MemOperand(sp, 80));
		unwinder.saveReg(0, x23, 16);
		unwinder.saveReg(0, x24, 8);

		Mov(x28, x0);		// arm7 registers
		Mov(x26, x1);		// lookup base

		// arm_dispatch:
		Bind(&arm_dispatch_label);
		arm_dispatch = GetCursorAddress<void (*)()>();
		Ldr(w3, arm_reg_operand(CYCL_CNT));			// load cycle counter
		Ldp(w0, w1, arm_reg_operand(R15_ARM_NEXT));	// load Next PC, interrupt
		Tbnz(w3, 31, &arm_exit);					// exit if cycle counter negative
		Ubfx(w2, w0, 2, 21);						// w2 = pc >> 2. Note: assuming address space == 8 MB (23 bits)
		Cbnz(w1, &arm_dofiq);						// if interrupt pending, handle it
		Add(x2, x26, Operand(x2, Shift::LSL, 3));	// x2 = EntryPoints + pc << 1
		Ldr(x3, MemOperand(x2));
		Br(x3);

		// arm_dofiq:
		Bind(&arm_dofiq);
		call((void*)CPUFiq);
		B(&arm_dispatch_label);

		// arm_exit:
		Bind(&arm_exit);
		Ldp(x23, x24, MemOperand(sp, 80));
		Ldp(x21, x22, MemOperand(sp, 64));
		Ldp(x19, x20, MemOperand(sp, 48));
		Ldp(x29, x30, MemOperand(sp, 32));
		Ldp(x27, x28, MemOperand(sp, 16));
		Ldp(x25, x26, MemOperand(sp, 96, AddrMode::PostIndex));
		Ret();

		FinalizeCode();
		size_t unwindSize = unwinder.end(recompiler::spaceLeft() - 128,
				(ptrdiff_t)recompiler::writeToExec(nullptr));
		verify(unwindSize <= 128);

		vmem_platform_flush_cache(
				recompiler::writeToExec(GetBuffer()->GetStartAddress<void *>()),
				recompiler::writeToExec(GetBuffer()->GetEndAddress<void *>()),
				GetBuffer()->GetStartAddress<void *>(), GetBuffer()->GetEndAddress<void *>());
		recompiler::advance(GetBuffer()->GetSizeInBytes());
	}
};

void AArch64ArmRegAlloc::LoadReg(int host_reg, Arm7Reg armreg)
{
	assembler.Ldr(getReg(host_reg), arm_reg_operand(armreg));
}

void AArch64ArmRegAlloc::StoreReg(int host_reg, Arm7Reg armreg)
{
	assembler.Str(getReg(host_reg), arm_reg_operand(armreg));
}

void arm7backend_compile(const std::vector<ArmOp>& block_ops, u32 cycles)
{
	Arm7Compiler assembler;
	assembler.compile(block_ops, cycles);
}

void arm7backend_flush()
{
	unwinder.clear();
	Arm7Compiler assembler;
	assembler.generateMainLoop();
}

} // namespace aicaarm
#endif // ARM64