From a9a2aad8f610a13d6f50cc8a761d8010fa4162dc Mon Sep 17 00:00:00 2001
From: Flyinghead
Date: Wed, 9 Jan 2019 16:35:23 +0100
Subject: [PATCH] arm64: use register-span allocation. Implement some opcodes natively

---
 core/hw/sh4/interpr/sh4_opcodes.cpp |   2 +-
 core/rec-ARM64/arm64_regalloc.h     |  84 ++++++
 core/rec-ARM64/rec_arm64.cpp        | 429 +++++++++++++++++++++++-----
 3 files changed, 436 insertions(+), 79 deletions(-)
 create mode 100644 core/rec-ARM64/arm64_regalloc.h

diff --git a/core/hw/sh4/interpr/sh4_opcodes.cpp b/core/hw/sh4/interpr/sh4_opcodes.cpp
index b71be433e..7f200dae1 100644
--- a/core/hw/sh4/interpr/sh4_opcodes.cpp
+++ b/core/hw/sh4/interpr/sh4_opcodes.cpp
@@ -1298,7 +1298,7 @@ INLINE void DYNACALL do_sqw(u32 Dest)
 }
 void DYNACALL do_sqw_mmu(u32 dst) { do_sqw(dst); }
 
-#if HOST_CPU!=CPU_ARM
+#if HOST_CPU != CPU_ARM && HOST_CPU != CPU_ARM64
 //yes, this micro optimization makes a difference
 extern "C" void DYNACALL do_sqw_nommu_area_3(u32 dst,u8* sqb)
 {
diff --git a/core/rec-ARM64/arm64_regalloc.h b/core/rec-ARM64/arm64_regalloc.h
new file mode 100644
index 000000000..48a258c1b
--- /dev/null
+++ b/core/rec-ARM64/arm64_regalloc.h
@@ -0,0 +1,84 @@
+/*
+	Copyright 2019 flyinghead
+
+	This file is part of reicast.
+
+	reicast is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	reicast is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with reicast.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef CORE_REC_ARM64_ARM64_REGALLOC_H_
+#define CORE_REC_ARM64_ARM64_REGALLOC_H_
+
+#include "hw/sh4/dyna/regalloc.h"
+#include "deps/vixl/aarch64/macro-assembler-aarch64.h"
+
+using namespace vixl::aarch64;
+
+enum eReg {
+	W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W16,
+	W17, W18, W19, W20, W21, W22, W23, W24, W25, W26, W27, W28, W29, W30
+};
+enum eFReg {
+	S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16,
+	S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31
+};
+
+#if HOST_OS == OS_DARWIN
+#error "TODO static eReg alloc_regs[]={r5,r6,r7,r10,(eReg)-1};" // TODO
+#else
+static eReg alloc_regs[] = { W19, W20, W21, W22, W23, W24, W25, W26, (eReg)-1 };
+#endif
+static eFReg alloc_fregs[] = { S8, S9, S10, S11, S12, S13, S14, S15, (eFReg)-1 };
+
+class Arm64Assembler;
+
+struct Arm64RegAlloc : RegAlloc<eReg, eFReg>
+{
+	Arm64RegAlloc(Arm64Assembler *assembler) : assembler(assembler) {}
+
+	void DoAlloc(RuntimeBlockInfo* block)
+	{
+		RegAlloc::DoAlloc(block, alloc_regs, alloc_fregs);
+	}
+
+	virtual void Preload(u32 reg, eReg nreg) override;
+	virtual void Writeback(u32 reg, eReg nreg) override;
+	virtual void Preload_FPU(u32 reg, eFReg nreg) override;
+	virtual void Writeback_FPU(u32 reg, eFReg nreg) override;
+
+	const Register& MapRegister(const shil_param& param)
+	{
+		eReg ereg = mapg(param);
+		if (ereg == (eReg)-1)
+			die("Register not allocated");
+		return Register::GetWRegFromCode(ereg);
+	}
+
+	const VRegister& MapVRegister(const shil_param& param)
+	{
+		eFReg ereg = mapf(param);
+		if (ereg == (eFReg)-1)
+			die("VRegister not allocated");
+		return VRegister::GetSRegFromCode(ereg);
+	}
+
+	Arm64Assembler *assembler;
+};
+
+extern Arm64RegAlloc reg;
+
+#endif /* CORE_REC_ARM64_ARM64_REGALLOC_H_ */
diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp
index 9847074e6..749cfd8a8 100644
--- a/core/rec-ARM64/rec_arm64.cpp
+++ b/core/rec-ARM64/rec_arm64.cpp
@@ -1,10 +1,28 @@
+/*
+	Copyright 2019 flyinghead
+
+	This file is part of reicast.
+
+	reicast is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	reicast is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with reicast.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
 #include "types.h"
 
 #if FEAT_SHREC == DYNAREC_JIT
 #include <unistd.h>
 #include <sys/mman.h>
-#include <map>
 
 #include "deps/vixl/aarch64/macro-assembler-aarch64.h"
 using namespace vixl::aarch64;
@@ -16,6 +34,7 @@ using namespace vixl::aarch64;
 #include "hw/sh4/sh4_core.h"
 #include "hw/sh4/dyna/ngen.h"
 #include "hw/sh4/sh4_mem.h"
+#include "arm64_regalloc.h"
 
 struct DynaRBI : RuntimeBlockInfo
 {
@@ -75,26 +94,75 @@ static void ngen_FailedToFindBlock_internal() {
 void(*ngen_FailedToFindBlock)() = &ngen_FailedToFindBlock_internal;
 
-static int cycle_counter;
+extern "C" {
+
+void *bm_GetCodeInternal(u32 pc)
+{
+	return (void*)bm_GetCode(pc);
+}
+
+void UpdateSystemInternal(u32 pc)
+{
+	if (UpdateSystem())
+		rdv_DoInterrupts_pc(pc);
+}
+
+}
 
 void ngen_mainloop(void* v_cntx)
 {
 	Sh4RCB* ctx = (Sh4RCB*)((u8*)v_cntx - sizeof(Sh4RCB));
 
-	cycle_counter = SH4_TIMESLICE;
+	__asm__ volatile
+	(
+		"stp x19, x20, [sp, #-144]!	\n\t"
+		"stp x21, x22, [sp, #16]	\n\t"
+		"stp x23, x24, [sp, #32]	\n\t"
+		"stp x25, x26, [sp, #48]	\n\t"
+		"stp x27, x28, [sp, #64]	\n\t"
+		"stp s8, s9, [sp, #80]		\n\t"
+		"stp s10, s11, [sp, #96]	\n\t"
+		"stp s12, s13, [sp, #112]	\n\t"
+		"stp s14, s15, [sp, #128]	\n\t"
+		// Use x28 as sh4 context pointer
+		"mov x28, %0				\n\t"
+		// Use x27 as cycle_counter
+		"mov w27, %2				\n\t"	// SH4_TIMESLICE
 
-	while (sh4_int_bCpuRun) {
-		do {
-			DynarecCodeEntryPtr rcb = bm_GetCode(ctx->cntx.pc);
-			rcb();
-		} while (cycle_counter > 0);
+		"run_loop:					\n\t"
+		"ldr w0, [x28, %3]			\n\t"	// CpuRunning
+		"cmp w0, #0					\n\t"
+		"b.eq end_run_loop			\n\t"
 
-		cycle_counter += SH4_TIMESLICE;
+		"slice_loop:				\n\t"
+		"ldr w0, [x28, %1]			\n\t"	// pc
+		"bl bm_GetCodeInternal		\n\t"
+		"blr x0						\n\t"
+		"cmp w27, #0				\n\t"
+		"b.gt slice_loop			\n\t"
 
-		if (UpdateSystem()) {
-			rdv_DoInterrupts_pc(ctx->cntx.pc);
-		}
-	}
+		"add w27, w27, %2			\n\t"	// SH4_TIMESLICE
+		"ldr w0, [x28, %1]			\n\t"	// pc
+		"bl UpdateSystemInternal	\n\t"
+		"b run_loop					\n\t"
+
+		"end_run_loop:				\n\t"
+		"ldp s14, s15, [sp, #128]	\n\t"
+		"ldp s12, s13, [sp, #112]	\n\t"
+		"ldp s10, s11, [sp, #96]	\n\t"
+		"ldp s8, s9, [sp, #80]		\n\t"
+		"ldp x27, x28, [sp, #64]	\n\t"
+		"ldp x25, x26, [sp, #48]	\n\t"
+		"ldp x23, x24, [sp, #32]	\n\t"
+		"ldp x21, x22, [sp, #16]	\n\t"
+		"ldp x19, x20, [sp], #144	\n\t"
+		:
+		: "r"(reinterpret_cast<uintptr_t>(&ctx->cntx)),
+		  "i"(offsetof(Sh4Context, pc)),
+		  "i"(SH4_TIMESLICE),
+		  "i"(offsetof(Sh4Context, CpuRunning))
+		: "memory"
+	);
 }
 
 void ngen_init()
@@ -128,8 +196,12 @@ void ngen_blockcheckfail(u32 pc) {
 
 class Arm64Assembler : public MacroAssembler
 {
+	typedef void (MacroAssembler::*Arm64Op)(const Register&, const Register&, const Operand&);
+	typedef void (MacroAssembler::*Arm64Op2)(const Register&, const Register&, const Register&);
+	typedef void (MacroAssembler::*Arm64Op3)(const Register&, const Register&, const Operand&, enum FlagsUpdate);
+
 public:
-	Arm64Assembler() : MacroAssembler((u8 *)emit_GetCCPtr(), 64 * 1024)
+	Arm64Assembler() : MacroAssembler((u8 *)emit_GetCCPtr(), 64 * 1024), regalloc(this)
 	{
 		call_regs.push_back(&w0);
 		call_regs.push_back(&w1);
@@ -159,24 +231,45 @@ public:
 		call_fregs.push_back(&s7);
 	}
 
+	void ngen_BinaryOp(shil_opcode* op, Arm64Op arm_op, Arm64Op2 arm_op2, Arm64Op3 arm_op3)
+	{
+		const Register* reg3 = &wzr;
+		if (op->rs2.is_imm())
+		{
+			Mov(w10, op->rs2._imm);
+			reg3 = &w10;
+		}
+		else if (op->rs2.is_r32i())
+		{
+			reg3 = &regalloc.MapRegister(op->rs2);
+		}
+		if (arm_op != NULL)
+			((*this).*arm_op)(regalloc.MapRegister(op->rd), regalloc.MapRegister(op->rs1), *reg3);
+		else if (arm_op2 != NULL)
+			((*this).*arm_op2)(regalloc.MapRegister(op->rd), regalloc.MapRegister(op->rs1), *reg3);
+		else
+			((*this).*arm_op3)(regalloc.MapRegister(op->rd), regalloc.MapRegister(op->rs1), *reg3, LeaveFlags);
+	}
+
 	void ngen_Compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool staging, bool optimise)
 	{
 		//printf("REC-ARM64 compiling %08x\n", block->addr);
 		if (force_checks)
 			CheckBlock(block);
 
-		Stp(x28, x30, MemOperand(sp, -16, PreIndex));
-		// Use x28 as sh4 context pointer
-		Mov(x28, reinterpret_cast<uintptr_t>(&p_sh4rcb->cntx));
+		Str(x30, MemOperand(sp, -16, PreIndex));
 
-		Mov(x9, reinterpret_cast<uintptr_t>(&cycle_counter));
-		Ldr(w10, MemOperand(x9));
-		Sub(w10, w10, block->guest_cycles);
-		Str(w10, MemOperand(x9));
+		// run register allocator
+		regalloc.DoAlloc(block);
+
+		// scheduler
+		Sub(w27, w27, block->guest_cycles);
 
 		for (size_t i = 0; i < block->oplist.size(); i++)
 		{
 			shil_opcode& op = block->oplist[i];
+			regalloc.OpBegin(&op, i);
+
 			switch (op.op)
 			{
 			case shop_ifb:	// Interpreter fallback
@@ -188,28 +281,42 @@ public:
 					Mov(*call_regs[0], op.rs3._imm);
 
 				CallRuntime(OpDesc[op.rs3._imm]->oph);
-				reg_cache.clear();
 				break;
 
 			case shop_jcond:
 			case shop_jdyn:
-				Ldr(w10, sh4_context_mem_operand(op.rs1.reg_ptr()));
+				Mov(w10, regalloc.MapRegister(op.rs1));
 				if (op.rs2.is_imm())
 				{
 					Mov(w9, op.rs2._imm);
 					Add(w10, w10, w9);
 				}
-				Str(w10, sh4_context_mem_operand(op.rd.reg_ptr()));
-				reg_cache.clear();
+				Mov(regalloc.MapRegister(op.rd), w10);
 				break;
 
 			case shop_mov32:
 				verify(op.rd.is_reg());
 				verify(op.rs1.is_reg() || op.rs1.is_imm());
 
-				shil_param_to_host_reg(op.rs1, w10);
-				host_reg_to_shil_param(op.rd, w10);
+				if (regalloc.IsAllocf(op.rd))
+				{
+					if (op.rs1.is_imm())
+						Fmov(regalloc.MapVRegister(op.rd), (float&)op.rs1._imm);
+					else if (regalloc.IsAllocf(op.rs1))
+						Fmov(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1));
+					else
+						Fmov(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));
+				}
+				else
+				{
+					if (op.rs1.is_imm())
+						Mov(regalloc.MapRegister(op.rd), op.rs1._imm);
+					else if (regalloc.IsAllocg(op.rs1))
+						Mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
+					else
+						Fmov(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				}
 				break;
 
 			case shop_mov64:
@@ -227,7 +334,6 @@ public:
 				{
 					shil_param_to_host_reg(op.rs3, w10);
 					Add(*call_regs[0], *call_regs[0], w10);
-					flush_reg_cache(*call_regs[0]);
 				}
 
 				u32 size = op.flags & 0x7f;
@@ -256,7 +362,6 @@ public:
 					die("1..8 bytes");
 					break;
 				}
-				reg_cache.clear();
 
 				if (size != 8)
 					host_reg_to_shil_param(op.rd, w0);
@@ -272,7 +377,6 @@ public:
 				{
 					shil_param_to_host_reg(op.rs3, w10);
 					Add(*call_regs[0], *call_regs[0], w10);
-					flush_reg_cache(*call_regs[0]);
 				}
 
 				u32 size = op.flags & 0x7f;
@@ -303,14 +407,150 @@ public:
 					die("1..8 bytes");
 					break;
 				}
-				reg_cache.clear();
 			}
 			break;
 
+			case shop_sync_sr:
+				CallRuntime(UpdateSR);
+				break;
+
+			case shop_neg:
+				Neg(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
+				break;
+			case shop_not:
+				Mvn(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
+				break;
+
+			case shop_shl:
+				ngen_BinaryOp(&op, NULL, &MacroAssembler::Lsl, NULL);
+				break;
+			case shop_shr:
+				ngen_BinaryOp(&op, NULL, &MacroAssembler::Lsr, NULL);
+				break;
+			case shop_sar:
+				ngen_BinaryOp(&op, NULL, &MacroAssembler::Asr, NULL);
+				break;
+			case shop_and:
+				ngen_BinaryOp(&op, &MacroAssembler::And, NULL, NULL);
+				break;
+			case shop_or:
+				ngen_BinaryOp(&op, &MacroAssembler::Orr, NULL, NULL);
+				break;
+			case shop_xor:
+				ngen_BinaryOp(&op, &MacroAssembler::Eor, NULL, NULL);
+				break;
+			case shop_add:
+				ngen_BinaryOp(&op, NULL, NULL, &MacroAssembler::Add);
+				break;
+			case shop_sub:
+				ngen_BinaryOp(&op, NULL, NULL, &MacroAssembler::Sub);
+				break;
+			case shop_ror:
+				ngen_BinaryOp(&op, NULL, &MacroAssembler::Ror, NULL);
+				break;
+
+			case shop_adc:
+				Cmp(regalloc.MapRegister(op.rs3), 1);	// C = rs3
+				Adcs(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2));	// (C,rd) = rs1 + rs2 + rs3(C)
+				Cset(regalloc.MapRegister(op.rd2), cs);	// rd2 = C
+				break;
+/* TODO
+			case shop_rocl:
+				Orr(reg.mapg(op->rd),reg.mapg(op->rs2),reg.mapg(op->rs1),true, S_LSL, 1); //(C,rd)= rs1<<1 + (|) rs2
+				MOVW(reg.mapg(op->rd2),0);                  //clear rd2 (for ADC/MOVCS)
+				ADC(reg.mapg(op->rd2),reg.mapg(op->rd2),0); //rd2=C (or MOVCS rd2, 1)
+			}
+			break;
+*/
+			case shop_mul_u16:
+				Uxth(w10, regalloc.MapRegister(op.rs1));
+				Uxth(w11, regalloc.MapRegister(op.rs2));
+				Mul(regalloc.MapRegister(op.rd), w10, w11);
+				break;
+			case shop_mul_s16:
+				Sxth(w10, regalloc.MapRegister(op.rs1));
+				Sxth(w11, regalloc.MapRegister(op.rs2));
+				Mul(regalloc.MapRegister(op.rd), w10, w11);
+				break;
+			case shop_mul_i32:
+				Mul(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2));
+				break;
+			case shop_mul_u64:
+			case shop_mul_s64:
+			{
+				const Register& rd_xreg = Register::GetXRegFromCode(regalloc.MapRegister(op.rd).GetCode());
+				if (op.op == shop_mul_u64)
+					Umull(rd_xreg, regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2));
+				else
+					Smull(rd_xreg, regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2));
+				const Register& rd2_xreg = Register::GetXRegFromCode(regalloc.MapRegister(op.rd2).GetCode());
+				Lsr(rd2_xreg, rd_xreg, 32);
+			}
+			break;
+
+			case shop_ext_s8:
+				Sxtb(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
+				break;
+			case shop_ext_s16:
+				Sxth(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
+				break;
+
+			//
+			// FPU
+			//
+
+			case shop_fadd:
+				Fadd(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2));
+				break;
+			case shop_fsub:
+				Fsub(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2));
+				break;
+			case shop_fmul:
+				Fmul(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2));
+				break;
+			case shop_fdiv:
+				Fdiv(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2));
+				break;
+
+			case shop_fabs:
+				Fabs(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				break;
+			case shop_fneg:
+				Fneg(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				break;
+			case shop_fsqrt:
+				Fsqrt(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				break;
+
+			case shop_fmac:
+				Fmadd(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs3), regalloc.MapVRegister(op.rs2), regalloc.MapVRegister(op.rs1));
+				break;
+
+			case shop_fsrra:
+				Fsqrt(s0, regalloc.MapVRegister(op.rs1));
+				Fmov(s1, 1.f);
+				Fdiv(regalloc.MapVRegister(op.rd), s1, s0);
+				break;
+
+			case shop_fsetgt:
+			case shop_fseteq:
+				Fcmp(regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2));
+				Cset(regalloc.MapRegister(op.rd), op.op == shop_fsetgt ? gt : eq);
+				break;
+
+			case shop_cvt_f2i_t:
+				Fcvtzs(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
+				break;
+			case shop_cvt_i2f_n:
+			case shop_cvt_i2f_z:
+				Scvtf(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));
+				break;
+
 			default:
 				shil_chf[op.op](&op);
 				break;
 			}
+			regalloc.OpEnd(&op);
 		}
 
 		switch (block->BlockType)
@@ -377,8 +617,7 @@ public:
 			die("Invalid block end type");
 		}
 
-
-		Ldp(x28, x30, MemOperand(sp, 16, PostIndex));
+		Ldr(x30, MemOperand(sp, 16, PostIndex));
 		Ret();
 
 		Label code_end;
@@ -464,7 +703,7 @@ public:
 			case CPT_f32:
 				if (prm.is_reg())
 				{
-					Ldr(*call_fregs[fregused], sh4_context_mem_operand(prm.reg_ptr()));
+					Fmov(*call_fregs[fregused], regalloc.MapVRegister(prm));
 				}
 				else
 				{
 					verify(prm.is_null());
 				}
 				break;
 
 			case CPT_ptr:
 				verify(prm.is_reg());
-				flush_reg_cache(*call_regs64[regused]);
 				// push the ptr itself
 				Mov(*call_regs64[regused++], reinterpret_cast<uintptr_t>(prm.reg_ptr()));
@@ -488,7 +726,13 @@ public:
 			}
 		}
 		CallRuntime((void (*)())function);
-		reg_cache.clear();
+	}
+
+	MemOperand sh4_context_mem_operand(void *p)
+	{
+		u32 offset = (u8*)p - (u8*)&p_sh4rcb->cntx;
+		verify((offset & 3) == 0 && offset <= 16380);	// FIXME 64-bit regs need multiple of 8 up to 32760
+		return MemOperand(x28, offset);
 	}
 
 private:
@@ -508,7 +752,7 @@ private:
 
 		while (sz > 0)
 		{
-			if (sz >= 8)
+			if (sz >= 8 && (reinterpret_cast<uintptr_t>(ptr) & 7) == 0)
 			{
 				Ldr(x10, MemOperand(x9, 8, PostIndex));
 				Ldr(x11, *(u64*)ptr);
@@ -516,7 +760,7 @@ private:
 				sz -= 8;
 				ptr += 8;
 			}
-			else if (sz >= 4)
+			else if (sz >= 4 && (reinterpret_cast<uintptr_t>(ptr) & 3) == 0)
 			{
 				Ldr(w10, MemOperand(x9, 4, PostIndex));
 				Ldr(w11, *(u32*)ptr);
@@ -543,60 +787,51 @@ private:
 		Bind(&blockcheck_success);
 	}
 
-	MemOperand sh4_context_mem_operand(void *p)
-	{
-		u32 offset = (u8*)p - (u8*)&p_sh4rcb->cntx;
-		verify((offset & 3) == 0 && offset <= 16380);	// FIXME 64-bit regs need multiple of 8 up to 32760
-		return MemOperand(x28, offset);
-	}
-
 	void shil_param_to_host_reg(const shil_param& param, const Register& reg)
 	{
-		if (param.is_imm()) {
+		if (param.is_imm())
+		{
 			Mov(reg, param._imm);
-			flush_reg_cache(reg);
 		}
-		else if (param.is_reg()) {
-			const Register *cached_reg = reg_cache[param._reg];
-			if (cached_reg != NULL)
-			{
-				if (cached_reg != &reg)
-				{
-					Mov(reg, *cached_reg);
-					set_reg_cache(param._reg, reg);
-				}
-			}
-			else
+		else if (param.is_reg())
+		{
+			if (param.is_r64f())
 			{
+				// TODO use regalloc
 				Ldr(reg, sh4_context_mem_operand(param.reg_ptr()));
-				set_reg_cache(param._reg, reg);
 			}
+			else if (param.is_r32f())
+				Fmov(reg, regalloc.MapVRegister(param));
+			else
+				Mov(reg, regalloc.MapRegister(param));
 		}
-		else {
+		else
+		{
 			verify(param.is_null());
 		}
 	}
 
 	void host_reg_to_shil_param(const shil_param& param, const CPURegister& reg)
 	{
-		Str(reg, sh4_context_mem_operand(param.reg_ptr()));
-		if (reg.IsRegister())
-			set_reg_cache(param._reg, (const Register&)reg);
-	}
-
-	void set_reg_cache(Sh4RegType sh4_reg_type, const Register& reg)
-	{
-		flush_reg_cache(reg);
-		reg_cache[sh4_reg_type] = &reg;
-	}
-
-	void flush_reg_cache(const Register& reg)
-	{
-		for (auto it = reg_cache.begin(); it != reg_cache.end();)
-			if (it->second != NULL && it->second->GetCode() == reg.GetCode())
-				it = reg_cache.erase(it);
+		if (reg.Is64Bits())
+		{
+			// TODO use regalloc
+			Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr()));
+		}
+		else if (regalloc.IsAllocg(param))
+		{
+			if (reg.IsRegister())
+				Mov(regalloc.MapRegister(param), (const Register&)reg);
 			else
-				it++;
+				Fmov(regalloc.MapRegister(param), (const VRegister&)reg);
+		}
+		else
+		{
+			if (reg.IsVRegister())
+				Fmov(regalloc.MapVRegister(param), (const VRegister&)reg);
+			else
+				Fmov(regalloc.MapVRegister(param), (const Register&)reg);
+		}
 	}
 
 	struct CC_PS
@@ -608,7 +843,7 @@ private:
 	std::vector<const Register*> call_regs;
 	std::vector<const Register*> call_regs64;
 	std::vector<const VRegister*> call_fregs;
-	std::map<Sh4RegType, const Register*> reg_cache;
+	Arm64RegAlloc regalloc;
 };
 
 static Arm64Assembler* compiler;
@@ -645,4 +880,42 @@ void ngen_CC_Finish(shil_opcode* op)
 }
 
+void Arm64RegAlloc::Preload(u32 reg, eReg nreg)
+{
+	assembler->Ldr(Register(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg)));
+}
+void Arm64RegAlloc::Writeback(u32 reg, eReg nreg)
+{
+	assembler->Str(Register(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg)));
+}
+void Arm64RegAlloc::Preload_FPU(u32 reg, eFReg nreg)
+{
+	assembler->Ldr(VRegister(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg)));
+}
+void Arm64RegAlloc::Writeback_FPU(u32 reg, eFReg nreg)
+{
+	assembler->Str(VRegister(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg)));
+}
+
+
+extern "C" void do_sqw_nommu_area_3(u32 dst, u8* sqb)
+{
+	__asm__ volatile
+	(
+		"movz x11, #0x0C00, lsl #16		\n\t"
+		"add x11, x1, x11				\n\t"	// get ram ptr from x1, part 1
+		"and x12, x0, #0x20				\n\t"	// SQ# selection, isolate
+		"ubfx x0, x0, #5, #20			\n\t"	// get ram offset
+		"add x1, x12, x1				\n\t"	// SQ# selection, add to SQ ptr
+		"add x11, x11, #512				\n\t"	// get ram ptr from x1, part 2
+		"add x11, x11, x0, lsl #5		\n\t"	// ram + offset
+		"ldp x9, x10, [x1], #16			\n\t"
+		"stp x9, x10, [x11], #16		\n\t"
+		"ldp x9, x10, [x1]				\n\t"
+		"stp x9, x10, [x11]				\n\t"
+		"ret							\n"
+
+		: : : "memory"
+	);
+}
 #endif // FEAT_SHREC == DYNAREC_JIT
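--
Scheduling note: the new ngen_mainloop keeps the Sh4Context pointer pinned in
x28 and the cycle counter in w27 across block calls, and every compiled block
subtracts its guest_cycles from w27 up front (the Sub(w27, w27,
block->guest_cycles) emitted in ngen_Compile). That is what allows the patch
to delete the cycle_counter global and the per-block load/store around it.
Below is a standalone C++ model of that scheduling for reference only; the
timeslice value and the stub block/system functions are illustrative
stand-ins, not code from the tree.

#include <cstdint>

// Minimal model of the Sh4Context fields the asm reads.
struct Sh4ContextModel { uint32_t pc; uint32_t CpuRunning; };

static const int32_t TIMESLICE = 448;   // stand-in for SH4_TIMESLICE

// Stand-in for one compiled block: real blocks keep the counter in w27;
// the model passes it explicitly instead.
static void sample_block(Sh4ContextModel& ctx, int32_t& cycles)
{
	cycles -= 3;                        // this block's guest_cycles
	ctx.pc += 2;
	if (ctx.pc >= 64)
		ctx.CpuRunning = 0;             // lets the demo terminate
}

static void update_system(Sh4ContextModel&) { /* timers/interrupts elided */ }

void mainloop_model(Sh4ContextModel& ctx)
{
	int32_t cycles = TIMESLICE;         // "mov w27, %2"
	while (ctx.CpuRunning != 0)         // run_loop: ldr/cmp/b.eq
	{
		do {                            // slice_loop
			sample_block(ctx, cycles);  // "bl bm_GetCodeInternal; blr x0"
		} while (cycles > 0);           // "cmp w27, #0; b.gt slice_loop"
		cycles += TIMESLICE;            // "add w27, w27, %2"
		update_system(ctx);             // "bl UpdateSystemInternal"
	}
}

int main()
{
	Sh4ContextModel ctx = { 0, 1 };
	mainloop_model(ctx);
	return 0;
}

As in the asm, CpuRunning is only tested once per timeslice, so a stop
request takes effect after the current slice drains.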