/* This file is part of Flycast. Flycast is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. Flycast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Flycast. If not, see . */ #include "types.h" #if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_ARM #include #include #include #include using namespace vixl::aarch32; #include "hw/sh4/sh4_opcode_list.h" #include "hw/sh4/sh4_mmr.h" #include "hw/sh4/sh4_rom.h" #include "hw/sh4/sh4_interrupts.h" #include "hw/sh4/sh4_core.h" #include "hw/sh4/dyna/ngen.h" #include "hw/sh4/dyna/ssa_regalloc.h" #include "hw/sh4/sh4_mem.h" #include "cfg/option.h" #include "arm_unwind.h" #include "oslib/virtmem.h" //#define CANONICALTEST /* ARM ABI r0~r1: scratch, params, return r2~r3: scratch, params 8 regs, v6 is platform dependent r4~r11 r12 is "The Intra-Procedure-call scratch register" r13 stack r14 link r15 pc Registers f0-s15 (d0-d7, q0-q3) do not need to be preserved (and can be used for passing arguments or returning results in standard procedure-call variants). Registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; Registers d16-d31 (q8-q15), if present, do not need to be preserved. Block linking Reg alloc r0~r4: scratch r5,r6,r7,r9,r10,r11: allocated r8: sh4 cntx fpu reg alloc d8:d15, single storage */ #ifdef __clang__ extern "C" char *stpcpy(char *dst, char const *src) { size_t src_len = strlen(src); return (char *)memcpy(dst, src, src_len) + src_len; } #endif #undef do_sqw_nommu #define rcbOffset(x) (-sizeof(Sh4RCB) + offsetof(Sh4RCB, x)) struct DynaRBI: RuntimeBlockInfo { virtual u32 Relink(); Register T_reg; }; using FPBinOP = void (MacroAssembler::*)(DataType, SRegister, SRegister, SRegister); using FPUnOP = void (MacroAssembler::*)(DataType, SRegister, SRegister); using BinaryOP = void (MacroAssembler::*)(Register, Register, const Operand&); class Arm32Assembler : public MacroAssembler { public: Arm32Assembler() = default; Arm32Assembler(u8 *buffer, size_t size) : MacroAssembler(buffer, size, A32) {} void Finalize() { FinalizeCode(); virtmem::flush_cache(GetBuffer()->GetStartAddress(), GetCursorAddress() - 1, GetBuffer()->GetStartAddress(), GetCursorAddress() - 1); } }; static Arm32Assembler ass; static ArmUnwindInfo unwinder; std::map ArmUnwindInfo::fdes; static void loadSh4Reg(Register Rt, u32 Sh4_Reg) { const int shRegOffs = (u8*)GetRegPtr(Sh4_Reg) - (u8*)&p_sh4rcb->cntx - sizeof(Sh4cntx); ass.Ldr(Rt, MemOperand(r8, shRegOffs)); } static void storeSh4Reg(Register Rt, u32 Sh4_Reg) { const int shRegOffs = (u8*)GetRegPtr(Sh4_Reg) - (u8*)&p_sh4rcb->cntx - sizeof(Sh4cntx); ass.Str(Rt, MemOperand(r8, shRegOffs)); } const int alloc_regs[] = { 5, 6, 7, 9, 10, 11, -1 }; const int alloc_regs_mmu[] = { 5, 6, 7, 10, 11, -1 }; const int alloc_fpu[] = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -1 }; struct arm_reg_alloc: RegAlloc { void Preload(u32 reg, int nreg) override { loadSh4Reg(Register(nreg), reg); } void Writeback(u32 reg, int nreg) override { if (reg == reg_pc_dyn) // reg_pc_dyn has been stored in r4 by the jdyn op implementation // No need to write it back since it won't be used past the end of the block ; //ass.Mov(r4, Register(nreg)); else storeSh4Reg(Register(nreg), reg); } void Preload_FPU(u32 reg, int nreg) override { const s32 shRegOffs = (u8*)GetRegPtr(reg) - (u8*)&p_sh4rcb->cntx - sizeof(Sh4cntx); ass.Vldr(SRegister(nreg), MemOperand(r8, shRegOffs)); } void Writeback_FPU(u32 reg, int nreg) override { const s32 shRegOffs = (u8*)GetRegPtr(reg) - (u8*)&p_sh4rcb->cntx - sizeof(Sh4cntx); ass.Vstr(SRegister(nreg), MemOperand(r8, shRegOffs)); } SRegister mapFReg(const shil_param& prm, int index = 0) { return SRegister(mapf(prm, index)); } Register mapReg(const shil_param& prm) { return Register(mapg(prm)); } }; static arm_reg_alloc reg; static const void *no_update; static const void *intc_sched; static const void *ngen_blockcheckfail; static const void *ngen_LinkBlock_Generic_stub; static const void *ngen_LinkBlock_cond_Branch_stub; static const void *ngen_LinkBlock_cond_Next_stub; static void (*ngen_FailedToFindBlock_)(); static void (*mainloop)(void *); static void (*handleException)(); static void (*checkBlockFpu)(); static void (*checkBlockNoFpu)(); static void generate_mainloop(); static std::map ccmap; static std::map ccnmap; static bool restarting; static u32 jmp_stack; void ngen_mainloop(void* context) { do { restarting = false; generate_mainloop(); mainloop(context); if (restarting) p_sh4rcb->cntx.CpuRunning = 1; } while (restarting); } static void jump(const void *code, ConditionType cond = al) { ptrdiff_t offset = reinterpret_cast(code) - ass.GetBuffer()->GetStartAddress(); verify((offset & 3) == 0); if (offset < -32 * 1024 * 1024 || offset >= 32 * 1024 * 1024) { WARN_LOG(DYNAREC, "jump offset too large: %d", offset); UseScratchRegisterScope scope(&ass); Register reg = scope.Acquire(); ass.Mov(cond, reg, (u32)code); ass.Bx(cond, reg); } else { Label code_label(offset); ass.B(cond, &code_label); } } static void call(const void *code, ConditionType cond = al) { ptrdiff_t offset = reinterpret_cast(code) - ass.GetBuffer()->GetStartAddress(); verify((offset & 3) == 0); if (offset < -32 * 1024 * 1024 || offset >= 32 * 1024 * 1024) { WARN_LOG(DYNAREC, "call offset too large: %d", offset); UseScratchRegisterScope scope(&ass); Register reg = scope.Acquire(); ass.Mov(cond, reg, (u32)code); ass.Blx(cond, reg); } else { Label code_label(offset); ass.Bl(cond, &code_label); } } static u32 relinkBlock(DynaRBI *block) { u32 start_offset = ass.GetCursorOffset(); switch(block->BlockType) { case BET_Cond_0: case BET_Cond_1: { //quick opt here: //peek into reg alloc, store actual sr_T register to relink_data #ifndef CANONICALTEST bool last_op_sets_flags = !block->has_jcond && !block->oplist.empty() && block->oplist.back().rd._reg == reg_sr_T && ccmap.count(block->oplist.back().op); #else bool last_op_sets_flags = false; #endif ConditionType CC = eq; if (last_op_sets_flags) { shilop op = block->oplist.back().op; if ((block->BlockType & 1) == 1) CC = ccmap[op]; else CC = ccnmap[op]; } else { if (!block->has_jcond) { if (block->T_reg.IsRegister()) { ass.Mov(r4, block->T_reg); } else { INFO_LOG(DYNAREC, "SLOW COND PATH %x", block->oplist.empty() ? -1 : block->oplist.back().op); loadSh4Reg(r4, reg_sr_T); } } ass.Cmp(r4, block->BlockType & 1); } if (!mmu_enabled()) { if (block->pBranchBlock) jump((void *)block->pBranchBlock->code, CC); else call(ngen_LinkBlock_cond_Branch_stub, CC); if (block->pNextBlock) jump((void *)block->pNextBlock->code); else call(ngen_LinkBlock_cond_Next_stub); ass.Nop(); ass.Nop(); ass.Nop(); ass.Nop(); } else { ass.Mov(Condition(CC).Negate(), r4, block->NextBlock); ass.Mov(CC, r4, block->BranchBlock); storeSh4Reg(r4, reg_nextpc); jump(no_update); } break; } case BET_DynamicRet: case BET_DynamicCall: case BET_DynamicJump: if (!mmu_enabled()) { ass.Sub(r2, r8, -rcbOffset(fpcb)); ass.Ubfx(r1, r4, 1, 24); ass.Ldr(pc, MemOperand(r2, r1, LSL, 2)); } else { storeSh4Reg(r4, reg_nextpc); jump(no_update); } break; case BET_StaticCall: case BET_StaticJump: if (!mmu_enabled()) { if (block->pBranchBlock == nullptr) call(ngen_LinkBlock_Generic_stub); else call((void *)block->pBranchBlock->code); ass.Nop(); ass.Nop(); ass.Nop(); } else { ass.Mov(r4, block->BranchBlock); storeSh4Reg(r4, reg_nextpc); jump(no_update); } break; case BET_StaticIntr: case BET_DynamicIntr: if (block->BlockType == BET_StaticIntr) ass.Mov(r4, block->NextBlock); //else -> already in r4 djump ! storeSh4Reg(r4, reg_nextpc); call((void *)UpdateINTC); loadSh4Reg(r4, reg_nextpc); jump(no_update); break; default: ERROR_LOG(DYNAREC, "Error, Relink() Block Type: %X", block->BlockType); verify(false); break; } return ass.GetCursorOffset() - start_offset; } u32 DynaRBI::Relink() { ass = Arm32Assembler((u8 *)code + relink_offset, host_code_size - relink_offset); u32 size = relinkBlock(this); ass.Finalize(); return size; } static Register GetParam(const shil_param& param, Register raddr = r0) { if (param.is_imm()) { ass.Mov(raddr, param._imm); return raddr; } if (param.is_r32i()) return reg.mapReg(param); die("Invalid parameter"); return Register(); } static void ngen_Binary(shil_opcode* op, BinaryOP dtop) { Register rs1 = GetParam(op->rs1); Register rs2 = r1; if (op->rs2.is_imm()) { if (ImmediateA32::IsImmediateA32(op->rs2._imm)) { (ass.*dtop)(reg.mapReg(op->rd), rs1, Operand(op->rs2._imm)); return; } ass.Mov(rs2, op->rs2._imm); } else if (op->rs2.is_r32i()) { rs2 = reg.mapReg(op->rs2); } else { ERROR_LOG(DYNAREC, "ngen_Bin ??? %d", op->rs2.type); verify(false); } (ass.*dtop)(reg.mapReg(op->rd), rs1, rs2); } static void ngen_fp_bin(shil_opcode* op, FPBinOP fpop) { SRegister rs1 = s0; if (op->rs1.is_imm()) { ass.Mov(r0, op->rs1._imm); ass.Vmov(rs1, r0); } else { rs1 = reg.mapFReg(op->rs1); } SRegister rs2 = s1; if (op->rs2.is_imm()) { ass.Mov(r0, op->rs2._imm); ass.Vmov(rs2, r0); } else { rs2 = reg.mapFReg(op->rs2); } (ass.*fpop)(DataType(F32), reg.mapFReg(op->rd), rs1, rs2); } static void ngen_fp_una(shil_opcode* op, FPUnOP fpop) { (ass.*fpop)(DataType(F32), reg.mapFReg(op->rd), reg.mapFReg(op->rs1)); } struct CC_PS { CanonicalParamType type; shil_param* par; }; static std::vector CC_pars; void ngen_CC_Start(shil_opcode* op) { CC_pars.clear(); } void ngen_CC_Param(shil_opcode* op,shil_param* par,CanonicalParamType tp) { switch(tp) { case CPT_f32rv: #ifdef __ARM_PCS_VFP // -mfloat-abi=hard if (reg.IsAllocg(*par)) ass.Vmov(reg.mapReg(*par), s0); else if (reg.IsAllocf(*par)) ass.Vmov(reg.mapFReg(*par), s0); break; #endif case CPT_u32rv: case CPT_u64rvL: if (reg.IsAllocg(*par)) ass.Mov(reg.mapReg(*par), r0); else if (reg.IsAllocf(*par)) ass.Vmov(reg.mapFReg(*par), r0); else die("unhandled param"); break; case CPT_u64rvH: verify(reg.IsAllocg(*par)); ass.Mov(reg.mapReg(*par), r1); break; case CPT_u32: case CPT_ptr: case CPT_f32: { CC_PS t = { tp, par }; CC_pars.push_back(t); } break; default: die("invalid tp"); break; } } void ngen_CC_Call(shil_opcode* op, void* function) { Register rd = r0; SRegister fd = s0; for (int i = CC_pars.size(); i-- > 0; ) { CC_PS& param = CC_pars[i]; if (param.type == CPT_ptr) { ass.Mov(rd, (u32)param.par->reg_ptr()); } else { if (param.par->is_reg()) { #ifdef __ARM_PCS_VFP // -mfloat-abi=hard if (param.type == CPT_f32) { if (reg.IsAllocg(*param.par)) ass.Vmov(fd, reg.mapReg(*param.par)); else if (reg.IsAllocf(*param.par)) ass.Vmov(fd, reg.mapFReg(*param.par)); else die("Must not happen!"); continue; } #endif if (reg.IsAllocg(*param.par)) ass.Mov(rd, reg.mapReg(*param.par)); else if (reg.IsAllocf(*param.par)) ass.Vmov(rd, reg.mapFReg(*param.par)); else die("Must not happen!"); } else { verify(param.par->is_imm()); ass.Mov(rd, param.par->_imm); } } rd = Register(rd.GetCode() + 1); fd = SRegister(fd.GetCode() + 1); } call(function); for (const CC_PS& ccParam : CC_pars) { const shil_param& prm = *ccParam.par; if (ccParam.type == CPT_ptr && prm.count() == 2 && reg.IsAllocf(prm) && (op->rd._reg == prm._reg || op->rd2._reg == prm._reg)) { // fsca rd param is a pointer to a 64-bit reg so reload the regs if allocated const int shRegOffs = (u8*)GetRegPtr(prm._reg) - (u8*)&p_sh4rcb->cntx - sizeof(Sh4cntx); ass.Vldr(reg.mapFReg(prm, 0), MemOperand(r8, shRegOffs)); ass.Vldr(reg.mapFReg(prm, 1), MemOperand(r8, shRegOffs + 4)); } } } void ngen_CC_Finish(shil_opcode* op) { CC_pars.clear(); } enum mem_op_type { SZ_8, SZ_16, SZ_32I, SZ_32F, SZ_64F, }; static mem_op_type memop_type(shil_opcode* op) { bool fp32 = op->rs2.is_r32f() || op->rd.is_r32f(); if (op->size == 1) return SZ_8; else if (op->size == 2) return SZ_16; else if (op->size == 4) return fp32 ? SZ_32F : SZ_32I; else if (op->size == 8) return SZ_64F; die("Unknown op"); return SZ_32I; } const u32 memop_bytes[] = { 1, 2, 4, 4, 8 }; static const void *_mem_hndl_SQ32[3][14]; static const void *_mem_hndl[2][3][14]; const void * const _mem_func[2][2] = { { (void *)addrspace::write32, (void *)addrspace::write64 }, { (void *)addrspace::read32, (void *)addrspace::read64 }, }; const struct { u32 mask; u32 key; bool read; mem_op_type optp; u32 offs; } op_table[] = { //LDRSB { 0x0E500FF0, 0x001000D0, true, SZ_8, 1 }, //LDRSH { 0x0E500FF0, 0x001000F0, true, SZ_16, 1 }, //LDR { 0x0E500010, 0x06100000, true, SZ_32I, 1 }, //VLDR.32 { 0x0F300F00, 0x0D100A00, true, SZ_32F, 2 }, //VLDR.64 { 0x0F300F00, 0x0D100B00, true, SZ_64F, 2 }, // //STRB { 0x0FF00010, 0x07C00000, false, SZ_8, 1 }, //STRH { 0x0FF00FF0, 0x018000B0, false, SZ_16, 1 }, //STR { 0x0E500010, 0x06000000, false, SZ_32I, 1 }, //VSTR.32 { 0x0F300F00, 0x0D000A00, false, SZ_32F, 2 }, //VSTR.64 { 0x0F300F00, 0x0D000B00, false, SZ_64F, 2 }, { 0, 0 }, }; union arm_mem_op { struct { u32 Ra:4; u32 pad0:8; u32 Rt:4; u32 Rn:4; u32 pad1:2; u32 D:1; u32 pad3:1; u32 pad4:4; u32 cond:4; }; u32 full; }; static void vmem_slowpath(Register raddr, Register rt, SRegister ft, DRegister fd, mem_op_type optp, bool read) { if (!raddr.Is(r0)) ass.Mov(r0, raddr); if (!read) { if (optp <= SZ_32I) ass.Mov(r1, rt); else if (optp == SZ_32F) ass.Vmov(r1, ft); else if (optp == SZ_64F) ass.Vmov(r2, r3, fd); } const void *funct = nullptr; if (optp <= SZ_32I) funct = _mem_hndl[read][optp][raddr.GetCode()]; else funct = _mem_func[read][optp - SZ_32F]; verify(funct != nullptr); call(funct); if (read) { if (optp <= SZ_32I) ass.Mov(rt, r0); else if (optp == SZ_32F) ass.Vmov(ft, r0); else if (optp == SZ_64F) ass.Vmov(fd, r0, r1); } } bool ngen_Rewrite(host_context_t &context, void *faultAddress) { u32 *regs = context.reg; arm_mem_op *ptr = (arm_mem_op *)context.pc; mem_op_type optp; u32 read; s32 offs = -1; u32 fop = ptr[0].full; for (int i = 0; op_table[i].mask; i++) { if ((fop & op_table[i].mask) == op_table[i].key) { optp = op_table[i].optp; read = op_table[i].read; offs = op_table[i].offs; } } if (offs == -1) { ERROR_LOG(DYNAREC, "%08X : invalid size", fop); die("can't decode opcode\n"); } ptr -= offs; Register raddr, rt; SRegister ft; DRegister fd; //Get used regs from opcodes .. if ((ptr[0].full & 0x0FE00070) == 0x07E00050) { //from ubfx ! raddr = Register(ptr[0].Ra); } else if ((ptr[0].full & 0x0FE00000) == 0x03C00000) { raddr = Register(ptr[0].Rn); } else { ERROR_LOG(DYNAREC, "fail raddr %08X {@%08X}:(", ptr[0].full, regs[1]); die("Invalid opcode: vmem fixup\n"); } //from mem op rt = Register(ptr[offs].Rt); ft = SRegister(ptr[offs].Rt * 2 + ptr[offs].D); fd = DRegister(ptr[offs].D * 16 + ptr[offs].Rt); //get some other relevant data u32 sh4_addr = regs[raddr.GetCode()]; u32 fault_offs = (uintptr_t)faultAddress - regs[8]; bool is_sq = (sh4_addr >> 26) == 0x38; ass = Arm32Assembler((u8 *)ptr, 12); // fault offset must always be the addr from ubfx (sanity check) // ignore last 2 bits zeroed to avoid sigbus errors verify(fault_offs == 0 || (fault_offs & ~3) == (sh4_addr & 0x1FFFFFFC)); if (is_sq && !read && optp >= SZ_32I) { if (optp >= SZ_32F) { if (!raddr.Is(r0)) ass.Mov(r0, raddr); else ass.Nop(); raddr = r0; } switch (optp) { case SZ_32I: ass.Mov(r1, rt); break; case SZ_32F: ass.Vmov(r1, ft); break; case SZ_64F: ass.Vmov(r2, r3, fd); break; default: break; } call(_mem_hndl_SQ32[optp - SZ_32I][raddr.GetCode()]); } else { //Fallback to function ! if (optp >= SZ_32F) { if (!raddr.Is(r0)) ass.Mov(r0, raddr); else ass.Nop(); } if (!read) { if (optp <= SZ_32I) ass.Mov(r1, rt); else if (optp == SZ_32F) ass.Vmov(r1, ft); else if (optp == SZ_64F) ass.Vmov(r2, r3, fd); } const void *funct = nullptr; if (offs == 1) funct = _mem_hndl[read][optp][raddr.GetCode()]; else if (optp >= SZ_32F) funct = _mem_func[read][optp - SZ_32F]; verify(funct != nullptr); call(funct); if (read) { if (optp <= SZ_32I) ass.Mov(rt, r0); else if (optp == SZ_32F) ass.Vmov(ft, r0); else if (optp == SZ_64F) ass.Vmov(fd, r0, r1); } } ass.Finalize(); context.pc = (size_t)ptr; return true; } static Register GenMemAddr(shil_opcode* op, Register raddr = r0) { if (op->rs3.is_imm()) { if (ImmediateA32::IsImmediateA32(op->rs3._imm)) { ass.Add(raddr, reg.mapReg(op->rs1), op->rs3._imm); } else { ass.Mov(r1, op->rs3._imm); ass.Add(raddr, reg.mapReg(op->rs1), r1); } } else if (op->rs3.is_r32i()) { ass.Add(raddr, reg.mapReg(op->rs1), reg.mapReg(op->rs3)); } else if (!op->rs3.is_null()) { ERROR_LOG(DYNAREC, "rs3: %08X", op->rs3.type); die("invalid rs3"); } else if (op->rs1.is_imm()) { ass.Mov(raddr, op->rs1._imm); } else { raddr = reg.mapReg(op->rs1); } return raddr; } static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool optimise) { if (!op->rs1.is_imm()) return false; void *ptr; bool isram; u32 addr; if (!rdv_readMemImmediate(op->rs1._imm, op->size, ptr, isram, addr, block)) return false; mem_op_type optp = memop_type(op); Register rd = (optp != SZ_32F && optp != SZ_64F) ? reg.mapReg(op->rd) : r0; if (isram) { if (optp == SZ_32F || optp == SZ_64F) ptr = (void *)((uintptr_t)ptr & ~3); ass.Mov(r0, (u32)ptr); switch(optp) { case SZ_8: ass.Ldrsb(rd, MemOperand(r0)); break; case SZ_16: ass.Ldrsh(rd, MemOperand(r0)); break; case SZ_32I: ass.Ldr(rd, MemOperand(r0)); break; case SZ_32F: ass.Vldr(reg.mapFReg(op->rd), MemOperand(r0)); break; case SZ_64F: if (reg.IsAllocf(op->rd)) { ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0)); ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4)); } else { ass.Vldr(d0, MemOperand(r0)); ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); } break; } } else { // Not RAM if (optp == SZ_64F) { // Need to call the handler twice ass.Mov(r0, op->rs1._imm); call(ptr); if (reg.IsAllocf(op->rd)) ass.Vmov(reg.mapFReg(op->rd, 0), r0); else ass.Str(r0, MemOperand(r8, op->rd.reg_nofs())); ass.Mov(r0, op->rs1._imm + 4); call(ptr); if (reg.IsAllocf(op->rd)) ass.Vmov(reg.mapFReg(op->rd, 1), r0); else ass.Str(r0, MemOperand(r8, op->rd.reg_nofs() + 4)); } else { ass.Mov(r0, op->rs1._imm); call(ptr); switch(optp) { case SZ_8: ass.Sxtb(r0, r0); break; case SZ_16: ass.Sxth(r0, r0); break; case SZ_32I: case SZ_32F: break; default: die("Invalid size"); break; } if (reg.IsAllocg(op->rd)) ass.Mov(rd, r0); else if (reg.IsAllocf(op->rd)) ass.Vmov(reg.mapFReg(op->rd), r0); else die("Unsupported"); } } return true; } static bool ngen_writemem_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool optimise) { if (!op->rs1.is_imm()) return false; void *ptr; bool isram; u32 addr; if (!rdv_writeMemImmediate(op->rs1._imm, op->size, ptr, isram, addr, block)) return false; mem_op_type optp = memop_type(op); Register rs2 = r1; SRegister rs2f = s0; if (op->rs2.is_imm()) ass.Mov(rs2, op->rs2._imm); else if (optp == SZ_32F) rs2f = reg.mapFReg(op->rs2); else if (optp != SZ_64F) rs2 = reg.mapReg(op->rs2); if (isram) { if (optp == SZ_32F || optp == SZ_64F) ptr = (void *)((uintptr_t)ptr & ~3); ass.Mov(r0, (u32)ptr); switch(optp) { case SZ_8: ass.Strb(rs2, MemOperand(r0)); break; case SZ_16: ass.Strh(rs2, MemOperand(r0)); break; case SZ_32I: ass.Str(rs2, MemOperand(r0)); break; case SZ_32F: ass.Vstr(rs2f, MemOperand(r0)); break; case SZ_64F: if (reg.IsAllocf(op->rs2)) { ass.Vstr(reg.mapFReg(op->rs2, 0), MemOperand(r0)); ass.Vstr(reg.mapFReg(op->rs2, 1), MemOperand(r0, 4)); } else { ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); ass.Vstr(d0, MemOperand(r0)); } break; default: die("Invalid size"); break; } } else { if (optp == SZ_64F) die("SZ_64F not supported"); ass.Mov(r0, op->rs1._imm); if (optp == SZ_32F) ass.Vmov(r1, rs2f); else if (!rs2.Is(r1)) ass.Mov(r1, rs2); call(ptr); } return true; } static void genMmuLookup(RuntimeBlockInfo* block, const shil_opcode& op, u32 write, Register& raddr) { if (mmu_enabled()) { Label inCache; Label done; ass.Lsr(r1, raddr, 12); ass.Ldr(r1, MemOperand(r9, r1, LSL, 2)); ass.Cmp(r1, 0); ass.B(ne, &inCache); if (!raddr.Is(r0)) ass.Mov(r0, raddr); ass.Mov(r1, write); ass.Mov(r2, block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); // pc call((void *)mmuDynarecLookup); ass.B(&done); ass.Bind(&inCache); ass.And(r0, raddr, 0xFFF); ass.Orr(r0, r0, r1); ass.Bind(&done); raddr = r0; } } static void interpreter_fallback(u16 op, OpCallFP *oph, u32 pc) { try { oph(op); } catch (SH4ThrownException& ex) { if (pc & 1) { // Delay slot AdjustDelaySlotException(ex); pc--; } Do_Exception(pc, ex.expEvn); handleException(); } } static void do_sqw_mmu_no_ex(u32 addr, u32 pc) { try { do_sqw_mmu(addr); } catch (SH4ThrownException& ex) { if (pc & 1) { // Delay slot AdjustDelaySlotException(ex); pc--; } Do_Exception(pc, ex.expEvn); handleException(); } } static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool optimise) { switch(op->op) { case shop_readm: if (!ngen_readm_immediate(block, op, optimise)) { mem_op_type optp = memop_type(op); Register raddr = GenMemAddr(op); genMmuLookup(block, *op, 0, raddr); if (addrspace::virtmemEnabled()) { ass.Bic(r1, raddr, optp == SZ_32F || optp == SZ_64F ? 0xE0000003 : 0xE0000000); switch(optp) { case SZ_8: ass.Ldrsb(reg.mapReg(op->rd), MemOperand(r1, r8)); break; case SZ_16: ass.Ldrsh(reg.mapReg(op->rd), MemOperand(r1, r8)); break; case SZ_32I: ass.Ldr(reg.mapReg(op->rd), MemOperand(r1, r8)); break; case SZ_32F: ass.Add(r1, r1, r8); //3 opcodes, there's no [REG+REG] VLDR ass.Vldr(reg.mapFReg(op->rd), MemOperand(r1)); break; case SZ_64F: ass.Add(r1, r1, r8); //3 opcodes, there's no [REG+REG] VLDR ass.Vldr(d0, MemOperand(r1)); if (reg.IsAllocf(op->rd)) { ass.Vmov(r0, r1, d0); ass.Vmov(reg.mapFReg(op->rd, 0), r0); ass.Vmov(reg.mapFReg(op->rd, 1), r1); // easier to do just this but we need to use a different op than 32f to distinguish during rewrite //ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r1)); //ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r1, 4)); } else { ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); } break; } } else { switch(optp) { case SZ_8: vmem_slowpath(raddr, reg.mapReg(op->rd), s0, d0, optp, true); break; case SZ_16: vmem_slowpath(raddr, reg.mapReg(op->rd), s0, d0, optp, true); break; case SZ_32I: vmem_slowpath(raddr, reg.mapReg(op->rd), s0, d0, optp, true); break; case SZ_32F: vmem_slowpath(raddr, r0, reg.mapFReg(op->rd), d0, optp, true); break; case SZ_64F: vmem_slowpath(raddr, r0, s0, d0, optp, true); if (reg.IsAllocf(op->rd)) { ass.Vmov(r0, r1, d0); ass.Vmov(reg.mapFReg(op->rd, 0), r0); ass.Vmov(reg.mapFReg(op->rd, 1), r1); } else { ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); } break; } } } break; case shop_writem: if (!ngen_writemem_immediate(block, op, optimise)) { mem_op_type optp = memop_type(op); Register raddr = GenMemAddr(op); genMmuLookup(block, *op, 1,raddr); Register rs2 = r2; SRegister rs2f = s2; if (optp == SZ_64F) { if (reg.IsAllocf(op->rs2)) { ass.Vmov(r2, reg.mapFReg(op->rs2, 0)); ass.Vmov(r3, reg.mapFReg(op->rs2, 1)); ass.Vmov(d0, r2, r3); } else { ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); } } else if (op->rs2.is_imm()) { ass.Mov(rs2, op->rs2._imm); if (optp == SZ_32F) ass.Vmov(rs2f, rs2); } else { if (optp == SZ_32F) rs2f = reg.mapFReg(op->rs2); else rs2 = reg.mapReg(op->rs2); } if (addrspace::virtmemEnabled()) { ass.Bic(r1, raddr, optp == SZ_32F || optp == SZ_64F ? 0xE0000003 : 0xE0000000); switch(optp) { case SZ_8: ass.Strb(rs2, MemOperand(r1, r8)); break; case SZ_16: ass.Strh(rs2, MemOperand(r1, r8)); break; case SZ_32I: ass.Str(rs2, MemOperand(r1, r8)); break; case SZ_32F: ass.Add(r1, r1, r8); //3 opcodes: there's no [REG+REG] VLDR, also required for SQ ass.Vstr(rs2f, MemOperand(r1)); break; case SZ_64F: ass.Add(r1, r1, r8); //3 opcodes: there's no [REG+REG] VLDR, also required for SQ ass.Vstr(d0, MemOperand(r1)); break; } } else { switch(optp) { case SZ_8: vmem_slowpath(raddr, rs2, s0, d0, optp, false); break; case SZ_16: vmem_slowpath(raddr, rs2, s0, d0, optp, false); break; case SZ_32I: vmem_slowpath(raddr, rs2, s0, d0, optp, false); break; case SZ_32F: vmem_slowpath(raddr, r0, rs2f, d0, optp, false); break; case SZ_64F: vmem_slowpath(raddr, r0, s0, d0, optp, false); break; } } } break; //dynamic jump, r+imm32.This will be at the end of the block, but doesn't -have- to be the last opcode case shop_jdyn: verify(op->rd.is_reg() && op->rd._reg == reg_pc_dyn); if (op->rs2.is_imm()) { ass.Mov(r2, op->rs2.imm_value()); ass.Add(r4, reg.mapReg(op->rs1), r2); } else { ass.Mov(r4, reg.mapReg(op->rs1)); } break; case shop_mov32: verify(op->rd.is_r32()); if (op->rs1.is_imm()) { if (op->rd.is_r32i()) { ass.Mov(reg.mapReg(op->rd), op->rs1._imm); } else { if (op->rs1._imm==0) { //VEOR(reg.mapFReg(op->rd),reg.mapFReg(op->rd),reg.mapFReg(op->rd)); //hum, vmov can't do 0, but can do all kind of weird small consts ... really useful ... //simd is slow on a9 #if 0 ass.Movw(r0, 0); ass.Vmov(reg.mapFReg(op->rd), r0); #else //1-1=0 ! //should be slightly faster ... //we could get rid of the imm mov, if not for infs & co .. ass.Vmov(reg.mapFReg(op->rd), 1.f);; ass.Vsub(reg.mapFReg(op->rd), reg.mapFReg(op->rd), reg.mapFReg(op->rd)); #endif } else if (op->rs1._imm == 0x3F800000) ass.Vmov(reg.mapFReg(op->rd), 1.f); else { ass.Mov(r0, op->rs1._imm); ass.Vmov(reg.mapFReg(op->rd), r0); } } } else if (op->rs1.is_r32()) { u32 type = 0; if (reg.IsAllocf(op->rd)) type |= 1; if (reg.IsAllocf(op->rs1)) type |= 2; switch(type) { case 0: // reg = reg ass.Mov(reg.mapReg(op->rd), reg.mapReg(op->rs1)); break; case 1: // vfp = reg ass.Vmov(reg.mapFReg(op->rd), reg.mapReg(op->rs1)); break; case 2: // reg = vfp ass.Vmov(reg.mapReg(op->rd), reg.mapFReg(op->rs1)); break; case 3: // vfp = vfp ass.Vmov(reg.mapFReg(op->rd), reg.mapFReg(op->rs1)); break; } } else { die("Invalid mov32 size"); } break; case shop_mov64: verify(op->rs1.is_r64f() && op->rd.is_r64f()); if (reg.IsAllocf(op->rd)) { verify(reg.IsAllocf(op->rs1)); SRegister rd0 = reg.mapFReg(op->rd, 0); SRegister rs0 = reg.mapFReg(op->rs1, 0); SRegister rd1 = reg.mapFReg(op->rd, 1); SRegister rs1 = reg.mapFReg(op->rs1, 1); if (rd0.Is(rs1)) { ass.Vmov(s0, rd0); ass.Vmov(rd0, rs0); ass.Vmov(rd1, s0); } else { if (!rd0.Is(rs0)) ass.Vmov(rd0, rs0); if (!rd1.Is(rs1)) ass.Vmov(rd1, rs1); } } else { ass.Vldr(d0, MemOperand(r8, op->rs1.reg_nofs())); ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); } break; case shop_jcond: verify(op->rd.is_reg() && op->rd._reg == reg_pc_dyn); ass.Mov(r4, reg.mapReg(op->rs1)); break; case shop_ifb: if (op->rs1._imm) { ass.Mov(r1, op->rs2._imm); storeSh4Reg(r1, reg_nextpc); } ass.Mov(r0, op->rs3._imm); if (!mmu_enabled()) { call((void *)OpPtr[op->rs3._imm]); } else { ass.Mov(r1, reinterpret_cast(*OpDesc[op->rs3._imm]->oph)); // op handler ass.Mov(r2, block->vaddr + op->guest_offs - (op->delay_slot ? 1 : 0)); // pc call((void *)interpreter_fallback); } break; #ifndef CANONICALTEST case shop_neg: ass.Rsb(reg.mapReg(op->rd), reg.mapReg(op->rs1), 0); break; case shop_not: ass.Mvn(reg.mapReg(op->rd), reg.mapReg(op->rs1)); break; case shop_shl: ngen_Binary(op, &MacroAssembler::Lsl); break; case shop_shr: ngen_Binary(op, &MacroAssembler::Lsr); break; case shop_sar: ngen_Binary(op, &MacroAssembler::Asr); break; case shop_and: ngen_Binary(op, &MacroAssembler::And); break; case shop_or: ngen_Binary(op, &MacroAssembler::Orr); break; case shop_xor: ngen_Binary(op, &MacroAssembler::Eor); break; case shop_add: ngen_Binary(op, &MacroAssembler::Add); break; case shop_sub: ngen_Binary(op, &MacroAssembler::Sub); break; case shop_ror: ngen_Binary(op, &MacroAssembler::Ror); break; case shop_adc: { Register rs1 = GetParam(op->rs1, r1); Register rs2 = GetParam(op->rs2, r2); Register rs3 = GetParam(op->rs3, r3); ass.Lsr(SetFlags, r0, rs3, 1); //C=rs3, r0=0 ass.Adc(SetFlags, reg.mapReg(op->rd), rs1, rs2); //(C,rd)=rs1+rs2+rs3(C) ass.Adc(reg.mapReg(op->rd2), r0, 0); //rd2=C, (or MOVCS rd2, 1) } break; case shop_rocr: { Register rd2 = reg.mapReg(op->rd2); Register rs1 = GetParam(op->rs1, r1); Register rs2 = GetParam(op->rs2, r2); if (!rd2.Is(rs1)) { ass.Lsr(SetFlags, rd2, rs2, 1); //C=rs2, rd2=0 ass.And(rd2, rs1, 1); //get new carry } else { ass.Lsr(SetFlags, r0, rs2, 1); //C=rs2, rd2=0 ass.Add(r0, rs1, 1); } ass.Rrx(reg.mapReg(op->rd), rs1); //RRX w/ carry :) if (rd2.Is(rs1)) ass.Mov(rd2, r0); } break; case shop_rocl: { Register rs1 = GetParam(op->rs1, r1); Register rs2 = GetParam(op->rs2, r2); ass.Orr(SetFlags, reg.mapReg(op->rd), rs2, Operand(rs1, LSL, 1)); //(C,rd)= rs1<<1 + (|) rs2 ass.Mov(reg.mapReg(op->rd2), 0); //clear rd2 (for ADC/MOVCS) ass.Adc(reg.mapReg(op->rd2), reg.mapReg(op->rd2), 0); //rd2=C (or MOVCS rd2, 1) } break; case shop_sbc: { Register rd2 = reg.mapReg(op->rd2); Register rs1 = GetParam(op->rs1, r1); if (rs1.Is(rd2)) { ass.Mov(r1, rs1); rs1 = r1; } Register rs2 = GetParam(op->rs2, r2); if (rs2.Is(rd2)) { ass.Mov(r2, rs2); rs2 = r2; } Register rs3 = GetParam(op->rs3, r3); ass.Eor(rd2, rs3, 1); ass.Lsr(SetFlags, rd2, rd2, 1); //C=rs3, rd2=0 ass.Sbc(SetFlags, reg.mapReg(op->rd), rs1, rs2); ass.Mov(cc, rd2, 1); } break; case shop_negc: { Register rd2 = reg.mapReg(op->rd2); Register rs1 = GetParam(op->rs1, r1); if (rs1.Is(rd2)) { ass.Mov(r1, rs1); rs1 = r1; } Register rs2 = GetParam(op->rs2, r2); ass.Eor(rd2, rs2, 1); ass.Lsr(SetFlags, rd2, rd2, 1); //C=rs3, rd2=0 ass.Sbc(SetFlags, reg.mapReg(op->rd), rd2, rs1); // rd2 == 0 ass.Mov(cc, rd2, 1); } break; case shop_shld: { verify(!op->rs2.is_imm()); ass.And(SetFlags, r0, reg.mapReg(op->rs2), 0x8000001F); ass.Rsb(mi, r0, r0, 0x80000020); Register rs1 = GetParam(op->rs1, r1); ass.Lsr(mi, reg.mapReg(op->rd), rs1, r0); ass.Lsl(pl, reg.mapReg(op->rd), rs1, r0); } break; case shop_shad: { verify(!op->rs2.is_imm()); ass.And(SetFlags, r0, reg.mapReg(op->rs2), 0x8000001F); ass.Rsb(mi, r0, r0, 0x80000020); Register rs1 = GetParam(op->rs1, r1); ass.Asr(mi, reg.mapReg(op->rd), rs1, r0); ass.Lsl(pl, reg.mapReg(op->rd), rs1, r0); } break; case shop_sync_sr: //must flush: SRS, SRT, r0-r7, r0b-r7b call((void *)UpdateSR); break; case shop_test: case shop_seteq: case shop_setge: case shop_setgt: case shop_setae: case shop_setab: { Register rd = reg.mapReg(op->rd); Register rs1 = GetParam(op->rs1, r0); Register rs2 = r1; bool is_imm = false; if (op->rs2.is_imm()) { if (!ImmediateA32::IsImmediateA32(op->rs2._imm)) ass.Mov(rs2, (u32)op->rs2._imm); else is_imm = true; } else if (op->rs2.is_r32i()) { rs2 = reg.mapReg(op->rs2); } else { ERROR_LOG(DYNAREC, "ngen_Bin ??? %d", op->rs2.type); verify(false); } if (op->op == shop_test) { if (is_imm) ass.Tst(rs1, op->rs2._imm); else ass.Tst(rs1, rs2); } else { if (is_imm) ass.Cmp(rs1, op->rs2._imm); else ass.Cmp(rs1, rs2); } static const ConditionType opcls2[] = { eq, eq, ge, gt, hs, hi }; ass.Mov(rd, 0); ass.Mov(opcls2[op->op-shop_test], rd, 1); } break; case shop_setpeq: { Register rs1 = GetParam(op->rs1, r1); Register rs2 = GetParam(op->rs2, r2); ass.Eor(r1, rs1, rs2); ass.Mov(reg.mapReg(op->rd), 0); ass.Tst(r1, 0xFF000000u); ass.Tst(ne, r1, 0x00FF0000u); ass.Tst(ne, r1, 0x0000FF00u); ass.Tst(ne, r1, 0x000000FFu); ass.Mov(eq, reg.mapReg(op->rd), 1); } break; //UXTH for zero extention and/or more mul forms (for 16 and 64 bits) case shop_mul_u16: { Register rs2 = GetParam(op->rs2, r2); ass.Uxth(r1, reg.mapReg(op->rs1)); ass.Uxth(r2, rs2); ass.Mul(reg.mapReg(op->rd), r1, r2); } break; case shop_mul_s16: { Register rs2 = GetParam(op->rs2, r2); ass.Sxth(r1, reg.mapReg(op->rs1)); ass.Sxth(r2, rs2); ass.Mul(reg.mapReg(op->rd), r1, r2); } break; case shop_mul_i32: { Register rs2 = GetParam(op->rs2, r2); //x86_opcode_class opdt[]={op_movzx16to32,op_movsx16to32,op_mov32,op_mov32,op_mov32}; //x86_opcode_class opmt[]={op_mul32,op_mul32,op_mul32,op_mul32,op_imul32}; //only the top 32 bits are different on signed vs unsigned ass.Mul(reg.mapReg(op->rd), reg.mapReg(op->rs1), rs2); } break; case shop_mul_u64: { Register rs2 = GetParam(op->rs2, r2); ass.Umull(reg.mapReg(op->rd), reg.mapReg(op->rd2), reg.mapReg(op->rs1), rs2); } break; case shop_mul_s64: { Register rs2 = GetParam(op->rs2, r2); ass.Smull(reg.mapReg(op->rd), reg.mapReg(op->rd2), reg.mapReg(op->rs1), rs2); } break; case shop_pref: { ConditionType cc = eq; if (!op->rs1.is_imm()) { ass.Lsr(r1, reg.mapReg(op->rs1), 26); ass.Mov(r0, reg.mapReg(op->rs1)); ass.Cmp(r1, 0x38); } else { // The SSA pass has already checked that the // destination is a store queue so no need to check ass.Mov(r0, op->rs1.imm_value()); cc = al; } if (mmu_enabled()) { ass.Mov(r1, block->vaddr + op->guest_offs - (op->delay_slot ? 1 : 0)); // pc call((void *)do_sqw_mmu_no_ex, cc); } else { ass.Ldr(r2, MemOperand(r8, rcbOffset(do_sqw_nommu))); ass.Sub(r1, r8, -rcbOffset(sq_buffer)); ass.Blx(cc, r2); } } break; case shop_ext_s8: case shop_ext_s16: if (op->op == shop_ext_s8) ass.Sxtb(reg.mapReg(op->rd), reg.mapReg(op->rs1)); else ass.Sxth(reg.mapReg(op->rd), reg.mapReg(op->rs1)); break; case shop_xtrct: { Register rd = reg.mapReg(op->rd); Register rs1; if (op->rs1.is_imm()) { rs1 = r1; ass.Mov(rs1, op->rs1._imm); } else { rs1 = reg.mapReg(op->rs1); } Register rs2; if (op->rs2.is_imm()) { rs2 = r2; ass.Mov(rs2, op->rs2._imm); } else { rs2 = reg.mapReg(op->rs2); } if (rd.Is(rs1)) { verify(!rd.Is(rs2)); ass.Lsr(rd, rs1, 16); ass.Lsl(r0, rs2, 16); } else { ass.Lsl(rd, rs2, 16); ass.Lsr(r0, rs1, 16); } ass.Orr(rd, rd, r0); } break; // // FPU // case shop_fadd: case shop_fsub: case shop_fmul: case shop_fdiv: { static const FPBinOP opcds[] = { &MacroAssembler::Vadd, &MacroAssembler::Vsub, &MacroAssembler::Vmul, &MacroAssembler::Vdiv }; ngen_fp_bin(op, opcds[op->op - shop_fadd]); } break; case shop_fabs: case shop_fneg: { static const FPUnOP opcds[] = { &MacroAssembler::Vabs, &MacroAssembler::Vneg }; ngen_fp_una(op, opcds[op->op - shop_fabs]); } break; case shop_fsqrt: ngen_fp_una(op, &MacroAssembler::Vsqrt); break; case shop_fmac: { SRegister rd = reg.mapFReg(op->rd); SRegister rs1 = s1; if (op->rs1.is_imm()) { ass.Mov(r0, op->rs1.imm_value()); ass.Vmov(rs1, r0); } else rs1 = reg.mapFReg(op->rs1); SRegister rs2 = s2; if (op->rs2.is_imm()) { ass.Mov(r1, op->rs2.imm_value()); ass.Vmov(rs2, r1); } else { rs2 = reg.mapFReg(op->rs2); if (rs2.Is(rd)) { ass.Vmov(s2, rs2); rs2 = s2; } } SRegister rs3 = s3; if (op->rs3.is_imm()) { ass.Mov(r2, op->rs3.imm_value()); ass.Vmov(rs3, r2); } else { rs3 = reg.mapFReg(op->rs3); if (rs3.Is(rd)) { ass.Vmov(s3, rs3); rs3 = s3; } } if (!rd.Is(rs1)) ass.Vmov(rd, rs1); ass.Vmla(rd, rs2, rs3); } break; case shop_fsrra: ass.Vmov(s1, 1.f); ass.Vsqrt(s0, reg.mapFReg(op->rs1)); ass.Vdiv(reg.mapFReg(op->rd), s1, s0); break; case shop_fsetgt: case shop_fseteq: #if 1 //this is apparently much faster (tested on A9) ass.Mov(reg.mapReg(op->rd), 0); ass.Vcmp(reg.mapFReg(op->rs1), reg.mapFReg(op->rs2)); ass.Vmrs(RegisterOrAPSR_nzcv(APSR_nzcv), FPSCR); if (op->op == shop_fsetgt) ass.Mov(gt, reg.mapReg(op->rd), 1); else ass.Mov(eq, reg.mapReg(op->rd), 1); #else if (op->op == shop_fsetgt) ass.Vcgt(d0, reg.mapFReg(op->rs1), reg.mapFReg(op->rs2)); else ass.Vceq(d0, reg.mapFReg(op->rs1), reg.mapFReg(op->rs2)); ass.Vmov(r0, s0); ass.And(reg.mapReg(op->rd), r0, 1); #endif break; case shop_fsca: //r1: base ptr ass.Mov(r1, (u32)sin_table & 0xFFFF); if (op->rs1.is_imm()) ass.Mov(r0, op->rs1._imm & 0xFFFF); else ass.Uxth(r0, reg.mapReg(op->rs1)); ass.Movt(r1, (u32)sin_table >> 16); ass.Add(r0, r1, Operand(r0, LSL, 3)); if (reg.IsAllocf(op->rd)) { ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0)); ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4)); } else { ass.Vldr(d0, MemOperand(r0)); ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); } break; case shop_fipr: { QRegister _r1 = q0; QRegister _r2 = q0; ass.Sub(r0, r8, op->rs1.reg_aofs()); if (op->rs2.reg_aofs() == op->rs1.reg_aofs()) { ass.Vldm(r0, NO_WRITE_BACK, DRegisterList(d0, 2)); } else { ass.Sub(r1, r8, op->rs2.reg_aofs()); ass.Vldm(r0, NO_WRITE_BACK, DRegisterList(d0, 2)); ass.Vldm(r1, NO_WRITE_BACK, DRegisterList(d2, 2)); _r2 = q1; } #if 1 //VFP SRegister fs2 = _r2.Is(q0) ? s0 : s4; ass.Vmul(reg.mapFReg(op->rd), s0, fs2); ass.Vmla(reg.mapFReg(op->rd), s1, SRegister(fs2.GetCode() + 1)); ass.Vmla(reg.mapFReg(op->rd), s2, SRegister(fs2.GetCode() + 2)); ass.Vmla(reg.mapFReg(op->rd), s3, SRegister(fs2.GetCode() + 3)); #else ass.Vmul(q0, _r1, _r2); ass.Vpadd(d0, d0, d1); ass.Vadd(reg.mapFReg(op->rd), f0, f1); #endif } break; case shop_ftrv: { Register rdp = r1; ass.Sub(r2, r8, op->rs2.reg_aofs()); ass.Sub(r1, r8, op->rs1.reg_aofs()); if (op->rs1.reg_aofs() != op->rd.reg_aofs()) { rdp = r0; ass.Sub(r0, r8, op->rd.reg_aofs()); } #if 1 //f0,f1,f2,f3 : vin //f4,f5,f6,f7 : out //f8,f9,f10,f11 : mtx temp //f12,f13,f14,f15 : mtx temp //(This is actually faster than using neon) ass.Vldm(r2, WRITE_BACK, DRegisterList(d4, 2)); ass.Vldm(r1, NO_WRITE_BACK, DRegisterList(d0, 2)); ass.Vmul(s4, vixl::aarch32::s8, s0); ass.Vmul(s5, s9, s0); ass.Vmul(s6, s10, s0); ass.Vmul(s7, s11, s0); ass.Vldm(r2, WRITE_BACK, DRegisterList(d6, 2)); ass.Vmla(s4, s12, s1); ass.Vmla(s5, s13, s1); ass.Vmla(s6, s14, s1); ass.Vmla(s7, s15, s1); ass.Vldm(r2, WRITE_BACK, DRegisterList(d4, 2)); ass.Vmla(s4, vixl::aarch32::s8, s2); ass.Vmla(s5, s9, s2); ass.Vmla(s6, s10, s2); ass.Vmla(s7, s11, s2); ass.Vldm(r2, NO_WRITE_BACK, DRegisterList(d6, 2)); ass.Vmla(s4, s12, s3); ass.Vmla(s5, s13, s3); ass.Vmla(s6, s14, s3); ass.Vmla(s7, s15, s3); ass.Vstm(rdp, NO_WRITE_BACK, DRegisterList(d2, 2)); #else //this fits really nicely to NEON ! // TODO ass.Vldm(d16,r2,8); ass.Vldm(d0,r1,2); ass.Vmla(q2,q8,d0,0); ass.Vmla(q2,q9,d0,1); ass.Vmla(q2,q10,d1,0); ass.Vmla(q2,q11,d1,1); ass.Vstm(d4,rdp,2); #endif } break; case shop_frswap: ass.Sub(r0, r8, op->rs1.reg_aofs()); ass.Sub(r1, r8, op->rd.reg_aofs()); //Assumes no FPU reg alloc here //frswap touches all FPU regs, so all spans should be clear here .. ass.Vldm(r1, NO_WRITE_BACK, DRegisterList(d0, 8)); ass.Vldm(r0, NO_WRITE_BACK, DRegisterList(d8, 8)); ass.Vstm(r0, NO_WRITE_BACK, DRegisterList(d0, 8)); ass.Vstm(r1, NO_WRITE_BACK, DRegisterList(d8, 8)); break; case shop_cvt_f2i_t: ass.Vcvt(S32, F32, s0, reg.mapFReg(op->rs1)); ass.Vmov(reg.mapReg(op->rd), s0); break; case shop_cvt_i2f_n: // may be some difference should be made ? case shop_cvt_i2f_z: ass.Vmov(s0, reg.mapReg(op->rs1)); ass.Vcvt(F32, S32, reg.mapFReg(op->rd), s0); break; #endif default: shil_chf[op->op](op); break; } } void ngen_Compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool staging, bool optimise) { ass = Arm32Assembler((u8 *)emit_GetCCPtr(), emit_FreeSpace()); block->code = (DynarecCodeEntryPtr)emit_GetCCPtr(); //reg alloc reg.DoAlloc(block, mmu_enabled() ? alloc_regs_mmu : alloc_regs, alloc_fpu); u8* blk_start = ass.GetCursorAddress(); //pre-load the first reg alloc operations, for better efficiency .. if (!block->oplist.empty()) reg.OpBegin(&block->oplist[0], 0); // block checks if (mmu_enabled()) { ass.Mov(r0, block->vaddr); ass.Mov(r1, block->addr); if (block->has_fpu_op) call((void *)checkBlockFpu); else call((void *)checkBlockNoFpu); } if (force_checks) { u32 addr = block->addr; ass.Mov(r0, addr); s32 sz = block->sh4_code_size; while (sz > 0) { if (sz > 2) { u32* ptr = (u32*)GetMemPtr(addr, 4); if (ptr != nullptr) { ass.Mov(r2, (u32)ptr); ass.Ldr(r2, MemOperand(r2)); ass.Mov(r1, *ptr); ass.Cmp(r1, r2); jump(ngen_blockcheckfail, ne); } addr += 4; sz -= 4; } else { u16* ptr = (u16 *)GetMemPtr(addr, 2); if (ptr != nullptr) { ass.Mov(r2, (u32)ptr); ass.Ldrh(r2, MemOperand(r2)); ass.Mov(r1, *ptr); ass.Cmp(r1, r2); jump(ngen_blockcheckfail, ne); } addr += 2; sz -= 2; } } } //scheduler ass.Ldr(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter))); ass.Cmp(r1, 0); Label cyclesRemaining; ass.B(pl, &cyclesRemaining); ass.Mov(r0, block->vaddr); call(intc_sched); ass.Mov(r1, r0); ass.Bind(&cyclesRemaining); const u32 cycles = block->guest_cycles; if (!ImmediateA32::IsImmediateA32(cycles)) { ass.Sub(r1, r1, cycles & ~3); ass.Sub(r1, r1, cycles & 3); } else { ass.Sub(r1, r1, cycles); } ass.Str(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter))); //compile the block's opcodes shil_opcode* op; for (size_t i = 0; i < block->oplist.size(); i++) { op = &block->oplist[i]; op->host_offs = ass.GetCursorOffset(); if (i != 0) reg.OpBegin(op, i); ngen_compile_opcode(block, op, optimise); reg.OpEnd(op); } if (block->BlockType == BET_Cond_0 || block->BlockType == BET_Cond_1) { // Store the arm reg containing sr.T in the block // This will be used when the block in (re)linked const shil_param param = shil_param(reg_sr_T); if (reg.IsAllocg(param)) ((DynaRBI *)block)->T_reg = reg.mapReg(param); else ((DynaRBI *)block)->T_reg = Register(); } reg.Cleanup(); //Relink written bytes must be added to the count ! block->relink_offset = ass.GetCursorOffset(); block->relink_data = 0; relinkBlock((DynaRBI *)block); ass.Finalize(); emit_Skip(ass.GetCursorOffset()); u8* pEnd = ass.GetCursorAddress(); //blk_start might not be the same, due to profiling counters .. block->host_opcodes = (pEnd - blk_start) / 4; //host code size needs to cover the entire range of the block block->host_code_size = pEnd - (u8*)block->code; } void ngen_ResetBlocks() { INFO_LOG(DYNAREC, "ngen_ResetBlocks()"); mainloop = nullptr; unwinder.clear(); if (p_sh4rcb->cntx.CpuRunning) { // Force the dynarec out of mainloop() to regenerate it p_sh4rcb->cntx.CpuRunning = 0; restarting = true; } else generate_mainloop(); } static void generate_mainloop() { if (mainloop != nullptr) return; INFO_LOG(DYNAREC, "Generating main loop"); ass = Arm32Assembler((u8 *)emit_GetCCPtr(), emit_FreeSpace()); unwinder.start(ass.GetCursorAddress()); // Stubs Label ngen_LinkBlock_Shared_stub; // ngen_LinkBlock_Generic_stub ngen_LinkBlock_Generic_stub = ass.GetCursorAddress(); ass.Mov(r1,r4); // djump/pc -> in case we need it .. ass.B(&ngen_LinkBlock_Shared_stub); // ngen_LinkBlock_cond_Branch_stub ngen_LinkBlock_cond_Branch_stub = ass.GetCursorAddress(); ass.Mov(r1, 1); ass.B(&ngen_LinkBlock_Shared_stub); // ngen_LinkBlock_cond_Next_stub ngen_LinkBlock_cond_Next_stub = ass.GetCursorAddress(); ass.Mov(r1, 0); ass.B(&ngen_LinkBlock_Shared_stub); // ngen_LinkBlock_Shared_stub ass.Bind(&ngen_LinkBlock_Shared_stub); ass.Mov(r0, lr); ass.Sub(r0, r0, 4); // go before the call call((void *)rdv_LinkBlock); ass.Bx(r0); // ngen_FailedToFindBlock_ ngen_FailedToFindBlock_ = ass.GetCursorAddress(); if (mmu_enabled()) { call((void *)rdv_FailedToFindBlock_pc); } else { ass.Mov(r0, r4); call((void *)rdv_FailedToFindBlock); } ass.Bx(r0); // ngen_blockcheckfail ngen_blockcheckfail = ass.GetCursorAddress(); call((void *)rdv_BlockCheckFail); if (mmu_enabled()) { Label jumpblockLabel; ass.Cmp(r0, 0); ass.B(ne, &jumpblockLabel); loadSh4Reg(r0, reg_nextpc); call((void *)bm_GetCodeByVAddr); ass.Bind(&jumpblockLabel); } ass.Bx(r0); // Main loop Label no_updateLabel; // ngen_mainloop: mainloop = ass.GetCursorAddress(); RegisterList savedRegisters = RegisterList::Union( RegisterList(r4, r5, r6, r7), RegisterList(r8, r9, r10, r11), RegisterList(r12, lr)); { UseScratchRegisterScope scope(&ass); scope.ExcludeAll(); ass.Push(savedRegisters); } unwinder.allocStack(0, 40); unwinder.saveReg(0, r4, 40); unwinder.saveReg(0, r5, 36); unwinder.saveReg(0, r6, 32); unwinder.saveReg(0, r7, 28); unwinder.saveReg(0, r8, 24); unwinder.saveReg(0, r9, 20); unwinder.saveReg(0, r10, 16); unwinder.saveReg(0, r11, 12); unwinder.saveReg(0, r12, 8); unwinder.saveReg(0, lr, 4); Label longjumpLabel; if (!mmu_enabled()) { // r8: context ass.Mov(r8, r0); } else { ass.Sub(sp, sp, 4); unwinder.allocStack(0, 8); ass.Push(r0); // push context unwinder.saveReg(0, r4, 4); ass.Mov(r0, reinterpret_cast(&jmp_stack)); ass.Mov(r1, sp); ass.Str(r1, MemOperand(r0)); ass.Bind(&longjumpLabel); ass.Ldr(r8, MemOperand(sp)); // r8: context ass.Mov(r9, (uintptr_t)mmuAddressLUT); // r9: mmu LUT } ass.Ldr(r4, MemOperand(r8, rcbOffset(cntx.pc))); // r4: pc ass.B(&no_updateLabel); // Go to mainloop ! // this code is here for fall-through behavior of do_iter Label do_iter; Label cleanup; // intc_sched: r0 is pc, r1 is cycle_counter intc_sched = ass.GetCursorAddress(); ass.Add(r1, r1, SH4_TIMESLICE); ass.Str(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter))); ass.Str(r0, MemOperand(r8, rcbOffset(cntx.pc))); ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.CpuRunning))); ass.Cmp(r0, 0); ass.B(eq, &cleanup); ass.Mov(r4, lr); call((void *)UpdateSystem_INTC); ass.Cmp(r0, 0); ass.B(ne, &do_iter); ass.Mov(lr, r4); ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter))); ass.Bx(lr); // do_iter: ass.Bind(&do_iter); ass.Ldr(r4, MemOperand(r8, rcbOffset(cntx.pc))); // no_update: no_update = ass.GetCursorAddress(); ass.Bind(&no_updateLabel); // next_pc _MUST_ be on r4 ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.CpuRunning))); ass.Cmp(r0, 0); ass.B(eq, &cleanup); if (!mmu_enabled()) { ass.Sub(r2, r8, -rcbOffset(fpcb)); ass.Ubfx(r1, r4, 1, 24); // 24+1 bits: 32 MB // RAM wraps around so if actual RAM size is 16MB, we won't overflow ass.Ldr(pc, MemOperand(r2, r1, LSL, 2)); } else { ass.Mov(r0, r4); call((void *)bm_GetCodeByVAddr); ass.Bx(r0); } // cleanup: ass.Bind(&cleanup); if (mmu_enabled()) ass.Add(sp, sp, 8); // pop context & alignment { UseScratchRegisterScope scope(&ass); scope.ExcludeAll(); ass.Pop(savedRegisters); } ass.Bx(lr); // Exception handler handleException = ass.GetCursorAddress(); if (mmu_enabled()) { ass.Mov(r0, reinterpret_cast(&jmp_stack)); ass.Ldr(r1, MemOperand(r0)); ass.Mov(sp, r1); ass.B(&longjumpLabel); } // MMU Check block (with fpu) // r0: vaddr, r1: addr checkBlockFpu = ass.GetCursorAddress(); Label fpu_enabled; loadSh4Reg(r2, reg_sr_status); ass.Tst(r2, 1 << 15); // test SR.FD bit ass.B(eq, &fpu_enabled); ass.Mov(r1, Sh4Ex_FpuDisabled); // exception code call((void *)Do_Exception); loadSh4Reg(r4, reg_nextpc); ass.B(&no_updateLabel); ass.Bind(&fpu_enabled); // fallthrough // MMU Check block (no fpu) // r0: vaddr, r1: addr checkBlockNoFpu = ass.GetCursorAddress(); loadSh4Reg(r2, reg_nextpc); ass.Cmp(r2, r0); ass.Mov(r0, r1); jump(ngen_blockcheckfail, ne); ass.Bx(lr); // Memory handlers for (int s=0;s<6;s++) { const void* fn=s==0?(void*)addrspace::read8SX32: s==1?(void*)addrspace::read16SX32: s==2?(void*)addrspace::read32: s==3?(void*)addrspace::write8: s==4?(void*)addrspace::write16: s==5?(void*)addrspace::write32: 0; bool read=s<=2; //r0 to r13 for (int i=0;i<=13;i++) { if (i==1 || i ==2 || i == 3 || i == 4 || i==12 || i==13) continue; const void *v; if (i == 0) v = fn; else { v = ass.GetCursorAddress(); ass.Mov(r0, Register(i)); jump(fn); } _mem_hndl[read][s % 3][i] = v; } } for (int optp = SZ_32I; optp <= SZ_64F; optp++) { //r0 to r13 for (int reg = 0; reg <= 13; reg++) { if (reg == 1 || reg == 2 || reg == 3 || reg == 4 || reg == 12 || reg == 13) continue; if (optp != SZ_32I && reg != 0) continue; _mem_hndl_SQ32[optp - SZ_32I][reg] = ass.GetCursorAddress(); if (optp == SZ_64F) { ass.Lsr(r1, r0, 26); ass.Cmp(r1, 0x38); ass.And(r1, r0, 0x3F); ass.Add(r1, r1, r8); jump((void *)&addrspace::write64, ne); ass.Strd(r2, r3, MemOperand(r1, rcbOffset(sq_buffer))); } else { ass.And(r3, Register(reg), 0x3F); ass.Lsr(r2, Register(reg), 26); ass.Add(r3, r3, r8); ass.Cmp(r2, 0x38); if (reg != 0) ass.Mov(ne, r0, Register(reg)); jump((void *)&addrspace::write32, ne); ass.Str(r1, MemOperand(r3, rcbOffset(sq_buffer))); } ass.Bx(lr); } } ass.Finalize(); emit_Skip(ass.GetBuffer()->GetSizeInBytes()); size_t unwindSize = unwinder.end(CODE_SIZE - 128); verify(unwindSize <= 128); ngen_FailedToFindBlock = ngen_FailedToFindBlock_; INFO_LOG(DYNAREC, "readm helpers: up to %p", ass.GetCursorAddress()); } void ngen_init() { INFO_LOG(DYNAREC, "Initializing the ARM32 dynarec"); ccmap[shop_test] = eq; ccnmap[shop_test] = ne; ccmap[shop_seteq] = eq; ccnmap[shop_seteq] = ne; ccmap[shop_setge] = ge; ccnmap[shop_setge] = lt; ccmap[shop_setgt] = gt; ccnmap[shop_setgt] = le; ccmap[shop_setae] = hs; ccnmap[shop_setae] = lo; ccmap[shop_setab] = hi; ccnmap[shop_setab] = ls; } void ngen_HandleException(host_context_t &context) { context.pc = (uintptr_t)handleException; } RuntimeBlockInfo* ngen_AllocateBlock() { generate_mainloop(); // FIXME why is this needed? return new DynaRBI(); }; #endif