From 62085539a77400903b64eb768179d17a044d1dec Mon Sep 17 00:00:00 2001
From: Flyinghead
Date: Fri, 23 Dec 2022 16:06:54 +0100
Subject: [PATCH] dynarec: reg alloc 64-bit regs. avoid some interpreter
 fallbacks

Option to reg alloc 64-bit regs in two host regs. Used when
FPSCR.SZ == 1 (64-bit reg and memory transfers). Enabled for the arm,
arm64 and x64 (Windows only) dynarecs.
Don't fall back to the interpreter when FPSCR.PR == 1 (double
precision) for FMOV, FLDS and FSTS, which don't depend on PR.
---
 core/hw/sh4/dyna/decoder.cpp    |  37 ++++----
 core/hw/sh4/dyna/shil.h         |  93 +++++++++-----
 core/hw/sh4/dyna/ssa.cpp        |   8 +-
 core/hw/sh4/dyna/ssa.h          |  13 ++-
 core/hw/sh4/dyna/ssa_regalloc.h |  96 +++++++++++--------
 core/hw/sh4/sh4_mmr.cpp         |   1 +
 core/rec-ARM/rec_arm.cpp        | 130 +++++++++++++++++-------
 core/rec-ARM64/arm64_regalloc.h |   8 +-
 core/rec-ARM64/rec_arm64.cpp    | 147 +++++++++++++++++-----------
 core/rec-cpp/rec_cpp.cpp        |   4 +-
 core/rec-x64/rec_x64.cpp        |  81 ++++++++++------
 core/rec-x64/x64_regalloc.h     |   9 +-
 core/rec-x64/xbyak_base.h       |  97 ++++++++++---------
 core/rec-x86/rec_x86.cpp        |  22 ++---
 core/rec-x86/x86_ops.cpp        |  18 ++--
 core/rec-x86/x86_regalloc.h     |   8 +-
 16 files changed, 474 insertions(+), 298 deletions(-)

diff --git a/core/hw/sh4/dyna/decoder.cpp b/core/hw/sh4/dyna/decoder.cpp
index e5ef21470..66a4e458c 100644
--- a/core/hw/sh4/dyna/decoder.cpp
+++ b/core/hw/sh4/dyna/decoder.cpp
@@ -48,7 +48,7 @@ static const char idle_hash[] =
 
 static inline shil_param mk_imm(u32 immv)
 {
-	return shil_param(FMT_IMM,immv);
+	return shil_param(immv);
 }
 
 static inline shil_param mk_reg(Sh4RegType reg)
@@ -63,17 +63,18 @@ static inline shil_param mk_regi(int reg)
 
 static state_t state;
 
-static void Emit(shilop op,shil_param rd=shil_param(),shil_param rs1=shil_param(),shil_param rs2=shil_param(),u32 flags=0,shil_param rs3=shil_param(),shil_param rd2=shil_param())
+static void Emit(shilop op, shil_param rd = shil_param(), shil_param rs1 = shil_param(), shil_param rs2 = shil_param(),
+		u32 size = 0, shil_param rs3 = shil_param(), shil_param rd2 = shil_param())
 {
 	shil_opcode sp;
-
-	sp.flags=flags;
-	sp.op=op;
-	sp.rd=(rd);
-	sp.rd2=(rd2);
-	sp.rs1=(rs1);
-	sp.rs2=(rs2);
-	sp.rs3=(rs3);
+
+	sp.size = size;
+	sp.op = op;
+	sp.rd = rd;
+	sp.rd2 = rd2;
+	sp.rs1 = rs1;
+	sp.rs2 = rs2;
+	sp.rs3 = rs3;
 
 	sp.guest_offs = state.cpu.rpc - blk->vaddr;
 	sp.delay_slot = state.cpu.is_delayslot;
@@ -83,12 +84,12 @@ static void Emit(shilop op,shil_param rd=shil_param(
 static void dec_fallback(u32 op)
 {
 	shil_opcode opcd;
-	opcd.op=shop_ifb;
+	opcd.op = shop_ifb;
 
-	opcd.rs1=shil_param(FMT_IMM,OpDesc[op]->NeedPC());
+	opcd.rs1 = shil_param(OpDesc[op]->NeedPC());
 
-	opcd.rs2=shil_param(FMT_IMM,state.cpu.rpc+2);
-	opcd.rs3=shil_param(FMT_IMM,op);
+	opcd.rs2 = shil_param(state.cpu.rpc + 2);
+	opcd.rs3 = shil_param(op);
 
 	opcd.guest_offs = state.cpu.rpc - blk->vaddr;
 	opcd.delay_slot = state.cpu.is_delayslot;
@@ -671,9 +672,13 @@ static bool dec_generic(u32 op)
 	if (op>=0xF000)
 	{
 		state.info.has_fpu=true;
-		if (state.cpu.FPR64)
+		if (state.cpu.FPR64) {
 			// fallback to interpreter for double float ops
-			return false;
+			// except fmov, flds and fsts that don't depend on PR
+			if (((op & 0xf) < 6 || (op & 0xf) > 0xc)	// fmov
+					&& (op & 0xef) != 0x0d)		// flds, fsts
+				return false;
+		}
 
 		if (state.cpu.FSZ64 && (d==PRM_FRN_SZ || d==PRM_FRM_SZ || s==PRM_FRN_SZ || s==PRM_FRM_SZ))
 			transfer_64 = true;
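As a reading aid for the hunk above: a minimal standalone mirror of the new FPSCR.PR == 1 filter (the helper name and the test opcodes are illustrative, not part of the patch). The FMOV variants occupy low nibbles 0x6 through 0xC, and FLDS (0xFm1D) / FSTS (0xFn0D) both satisfy (op & 0xef) == 0x0d, so only those opcodes stay in the dynarec while double precision is active:

#include <cassert>

// mirrors the check added to dec_generic(): true means the opcode still
// needs the interpreter fallback when FPSCR.PR == 1
static bool fallbackWhenPR64(unsigned op)
{
	const bool fmov = (op & 0xf) >= 6 && (op & 0xf) <= 0xc;	// fmov group
	const bool fldsFsts = (op & 0xef) == 0x0d;		// flds / fsts
	return !fmov && !fldsFsts;
}

int main()
{
	assert(!fallbackWhenPR64(0xF23C));	// fmov FR3,FR2
	assert(!fallbackWhenPR64(0xF01D));	// flds FR0,FPUL
	assert(!fallbackWhenPR64(0xF00D));	// fsts FPUL,FR0
	assert(fallbackWhenPR64(0xF230));	// fadd FR3,FR2 still falls back
	return 0;
}

diff --git a/core/hw/sh4/dyna/shil.h b/core/hw/sh4/dyna/shil.h
index 6e5aaabf5..9e2112bbf 100644
--- a/core/hw/sh4/dyna/shil.h
+++ 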
b/core/hw/sh4/dyna/shil.h @@ -7,23 +7,19 @@ extern shil_chfp* shil_chf[]; enum shil_param_type { - //2 bits FMT_NULL, FMT_IMM, FMT_I32, FMT_F32, FMT_F64, - FMT_V2, - FMT_V3, FMT_V4, - FMT_V8, FMT_V16, - FMT_REG_BASE=FMT_I32, - FMT_VECTOR_BASE=FMT_V2, + FMT_REG_BASE = FMT_I32, + FMT_VECTOR_BASE = FMT_V4, - FMT_MASK=0xFFFF, + FMT_MASK = 0xFFFF, }; /* @@ -39,56 +35,54 @@ struct shil_param { shil_param() { - type=FMT_NULL; - _imm=0xFFFFFFFF; + type = FMT_NULL; + _imm = 0xFFFFFFFF; memset(version, 0, sizeof(version)); } - shil_param(u32 type,u32 imm) + + shil_param(u32 imm) { - this->type=type; - if (type >= FMT_REG_BASE) - new (this) shil_param((Sh4RegType)imm); - _imm=imm; + this->type = FMT_IMM; + _imm = imm; memset(version, 0, sizeof(version)); } shil_param(Sh4RegType reg) { - type=FMT_NULL; - if (reg>=reg_fr_0 && reg<=reg_xf_15) + if (reg >= reg_fr_0 && reg <= reg_xf_15) { - type=FMT_F32; - _imm=reg; + type = FMT_F32; + _imm = reg; } - else if (reg>=regv_dr_0 && reg<=regv_dr_14) + else if (reg >= regv_dr_0 && reg <= regv_dr_14) { - type=FMT_F64; - _imm=(reg-regv_dr_0)*2+reg_fr_0; + type = FMT_F64; + _imm = (reg - regv_dr_0) * 2 + reg_fr_0; } - else if (reg>=regv_xd_0 && reg<=regv_xd_14) + else if (reg >= regv_xd_0 && reg <= regv_xd_14) { - type=FMT_F64; - _imm=(reg-regv_xd_0)*2+reg_xf_0; + type = FMT_F64; + _imm = (reg - regv_xd_0) * 2 + reg_xf_0; } - else if (reg>=regv_fv_0 && reg<=regv_fv_12) + else if (reg >= regv_fv_0 && reg <= regv_fv_12) { - type=FMT_V4; - _imm=(reg-regv_fv_0)*4+reg_fr_0; + type = FMT_V4; + _imm = (reg - regv_fv_0) * 4 + reg_fr_0; } - else if (reg==regv_xmtrx) + else if (reg == regv_xmtrx) { - type=FMT_V16; - _imm=reg_xf_0; + type = FMT_V16; + _imm = reg_xf_0; } - else if (reg==regv_fmtrx) + else if (reg == regv_fmtrx) { - type=FMT_V16; - _imm=reg_fr_0; + type = FMT_V16; + _imm = reg_fr_0; } else { - type=FMT_I32; - _reg=reg; + type = FMT_I32; + _reg = reg; } memset(version, 0, sizeof(version)); } @@ -106,25 +100,22 @@ struct shil_param bool is_r32i() const { return type==FMT_I32; } bool is_r32f() const { return type==FMT_F32; } - u32 is_r32fv() const { return type>=FMT_VECTOR_BASE?count():0; } + u32 is_r32fv() const { return type >= FMT_VECTOR_BASE ? count() : 0; } bool is_r64f() const { return type==FMT_F64; } bool is_r32() const { return is_r32i() || is_r32f(); } - bool is_r64() const { return is_r64f(); } //just here for symmetry ... bool is_imm_s8() const { return is_imm() && (int8_t)_imm == (int32_t)_imm; } - u32* reg_ptr() const { verify(is_reg()); return GetRegPtr(_reg); } - s32 reg_nofs() const { verify(is_reg()); return (s32)((u8*)GetRegPtr(_reg) - (u8*)GetRegPtr(reg_xf_0)-sizeof(Sh4cntx)); } - u32 reg_aofs() const { return -reg_nofs(); } + u32* reg_ptr() const { verify(is_reg()); return GetRegPtr(_reg); } + s32 reg_nofs() const { verify(is_reg()); return (s32)((u8*)GetRegPtr(_reg) - (u8*)GetRegPtr(reg_xf_0)-sizeof(Sh4cntx)); } + u32 reg_aofs() const { return -reg_nofs(); } u32 imm_value() const { verify(is_imm()); return _imm; } - bool is_vector() const { return type>=FMT_VECTOR_BASE; } - - u32 count() const { return type==FMT_F64?2:type==FMT_V2?2: - type==FMT_V3?3:type==FMT_V4?4:type==FMT_V8?8: - type==FMT_V16?16:1; } //count of hardware regs + u32 count() const { return type == FMT_F64 ? 2 : + type == FMT_V4 ? 4 : + type == FMT_V16 ? 
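// [Editor's illustration, not part of the patch] Only three multi-register
// views survive the slimmed-down format enum: FMT_F64 is a DR/XD pair
// (count 2), FMT_V4 an FV vector (count 4) and FMT_V16 the XMTRX/FMTRX
// matrix view (count 16); everything else is a single register. For
// example, shil_param(regv_dr_0) yields type == FMT_F64, _imm == reg_fr_0
// and count() == 2, which is what lets the allocator map the two halves
// of a double separately.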
16 : 1; } //count of hardware regs /* Imms: @@ -134,20 +125,18 @@ struct shil_param integer regs : is_r32i,is_r32,count=1 fpu regs, single view : is_r32f,is_r32,count=1 fpu regs, double view : is_r64f,count=2 - fpu regs, quad view : is_vector,is_r32fv=4, count=4 - fpu regs, matrix view : is_vector,is_r32fv=16, count=16 + fpu regs, quad view : is_r32fv=4, count=4 + fpu regs, matrix view : is_r32fv=16, count=16 */ }; struct shil_opcode { shilop op; - u32 Flow; - u32 flags; - u32 flags2; + u32 size; // memory access size - shil_param rd,rd2; - shil_param rs1,rs2,rs3; + shil_param rd, rd2; + shil_param rs1, rs2, rs3; u16 host_offs; u16 guest_offs; diff --git a/core/hw/sh4/dyna/ssa.cpp b/core/hw/sh4/dyna/ssa.cpp index 04624c980..66e4e911c 100644 --- a/core/hw/sh4/dyna/ssa.cpp +++ b/core/hw/sh4/dyna/ssa.cpp @@ -86,7 +86,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) shil_param op2_rd = shil_param(op->rd2._reg); op2_rd.version[0] = op->rd2.version[0]; - InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd2)); + InsertMov32Op(op2_rd, shil_param(rd2)); // the previous insert might have invalidated our reference op = &block->oplist[opnum - 1]; @@ -151,7 +151,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) shil_param op2_rd = shil_param(op->rd2._reg); op2_rd.version[0] = op->rd2.version[0]; - InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd2)); + InsertMov32Op(op2_rd, shil_param(rd2)); // the previous insert might have invalidated our reference op = &block->oplist[opnum - 1]; @@ -201,7 +201,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) shil_param op2_rd = shil_param((Sh4RegType)(op->rd._reg + 1)); op2_rd.version[0] = op->rd.version[1]; - InsertMov32Op(op2_rd, shil_param(FMT_IMM, res >> 32)); + InsertMov32Op(op2_rd, shil_param(res >> 32)); // the previous insert might have invalidated our reference op = &block->oplist[opnum - 1]; @@ -328,7 +328,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) shil_param op2_rd = shil_param((Sh4RegType)(op->rd._reg + 1)); op2_rd.version[0] = op->rd.version[1]; - InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd_1)); + InsertMov32Op(op2_rd, shil_param(rd_1)); // the previous insert might have invalidated our reference op = &block->oplist[opnum - 1]; diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index 6a6ddb097..b8ecc1c70 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include "types.h" #include "decoder.h" @@ -124,7 +123,7 @@ private: { verify(op.rd2.is_null()); op.op = shop_mov32; - op.rs1 = shil_param(FMT_IMM, v); + op.rs1 = shil_param(v); op.rs2.type = FMT_NULL; op.rs3.type = FMT_NULL; stats.constant_ops_replaced++; @@ -235,7 +234,7 @@ private: if (op.rs1.is_imm() && op.op == shop_readm && block->read_only && (op.rs1._imm >> 12) >= (block->vaddr >> 12) && (op.rs1._imm >> 12) <= ((block->vaddr + block->sh4_code_size - 1) >> 12) - && (op.flags & 0x7f) <= 4) + && op.size <= 4) { bool doit = false; if (mmu_enabled()) @@ -262,7 +261,7 @@ private: if (doit) { u32 v; - switch (op.flags & 0x7f) + switch (op.size) { case 1: v = (s32)(::s8)ReadMem8(op.rs1._imm); @@ -513,7 +512,7 @@ private: // There's quite a few of these //printf("%08x +t<< %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str()); op.op = shop_shl; - op.rs2 = shil_param(FMT_IMM, 1); + op.rs2 = shil_param(1); } // a ^ a == 0 // a - a == 0 @@ -526,8 +525,8 @@ private: else if (op.op == shop_sbc) { //printf("%08x ZERO %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str()); - op.rs1 = 
shil_param(FMT_IMM, 0); - op.rs2 = shil_param(FMT_IMM, 0); + op.rs1 = shil_param(0); + op.rs2 = shil_param(0); stats.prop_constants += 2; } // a & a == a diff --git a/core/hw/sh4/dyna/ssa_regalloc.h b/core/hw/sh4/dyna/ssa_regalloc.h index 025d32117..9533780f0 100644 --- a/core/hw/sh4/dyna/ssa_regalloc.h +++ b/core/hw/sh4/dyna/ssa_regalloc.h @@ -28,7 +28,7 @@ #define ssa_printf(...) DEBUG_LOG(DYNAREC, __VA_ARGS__) -template +template class RegAlloc { public: @@ -78,17 +78,17 @@ public: FlushReg((Sh4RegType)i, true); } // Flush regs used by vector ops - if (op->rs1.is_reg() && op->rs1.count() > 1) + if (op->rs1.is_reg() && op->rs1.count() > MaxVecSize) { for (u32 i = 0; i < op->rs1.count(); i++) FlushReg((Sh4RegType)(op->rs1._reg + i), false); } - if (op->rs2.is_reg() && op->rs2.count() > 1) + if (op->rs2.is_reg() && op->rs2.count() > MaxVecSize) { for (u32 i = 0; i < op->rs2.count(); i++) FlushReg((Sh4RegType)(op->rs2._reg + i), false); } - if (op->rs3.is_reg() && op->rs3.count() > 1) + if (op->rs3.is_reg() && op->rs3.count() > MaxVecSize) { for (u32 i = 0; i < op->rs3.count(); i++) FlushReg((Sh4RegType)(op->rs3._reg + i), false); @@ -100,7 +100,7 @@ public: AllocSourceReg(op->rs3); // Hard flush vector ops destination regs // Note that this is incorrect if a reg is both src (scalar) and dest (vec). However such an op doesn't exist. - if (op->rd.is_reg() && op->rd.count() > 1) + if (op->rd.is_reg() && op->rd.count() > MaxVecSize) { for (u32 i = 0; i < op->rd.count(); i++) { @@ -108,7 +108,7 @@ public: FlushReg((Sh4RegType)(op->rd._reg + i), true); } } - if (op->rd2.is_reg() && op->rd2.count() > 1) + if (op->rd2.is_reg() && op->rd2.count() > MaxVecSize) { for (u32 i = 0; i < op->rd2.count(); i++) { @@ -133,9 +133,7 @@ public: // Flush normally for (auto const& reg : reg_alloced) - { FlushReg(reg.first, false); - } // Hard flush all dirty regs. Useful for troubleshooting // while (!reg_alloced.empty()) @@ -175,7 +173,7 @@ public: bool rv = IsAllocAny(prm._reg); if (prm.count() != 1) { - for (u32 i = 1;i < prm.count(); i++) + for (u32 i = 1; i < prm.count(); i++) verify(IsAllocAny((Sh4RegType)(prm._reg + i)) == rv); } return rv; @@ -190,7 +188,8 @@ public: { if (prm.is_reg()) { - verify(prm.count() == 1); + if (prm.count() > MaxVecSize) + return false; return IsAllocg(prm._reg); } else @@ -203,7 +202,8 @@ public: { if (prm.is_reg()) { - verify(prm.count() == 1); + if (prm.count() > MaxVecSize) + return false; return IsAllocf(prm._reg); } else @@ -219,11 +219,11 @@ public: return mapg(prm._reg); } - nregf_t mapf(const shil_param& prm) + nregf_t mapf(const shil_param& prm, int index = 0) { verify(IsAllocf(prm)); - verify(prm.count() == 1); - return mapf(prm._reg); + verify(prm.count() <= MaxVecSize); + return mapf((Sh4RegType)(prm._reg + index)); } bool reg_used(nreg_t host_reg) @@ -266,6 +266,7 @@ private: bool write_back; bool dirty; }; + static constexpr u32 MaxVecSize = AllocVec2 ? 
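// [Editor's sketch] MaxVecSize is the widest parameter the allocator will
// keep in host registers: 1 normally, 2 when AllocVec2 is set, so F64
// operands (count() == 2) become two independently tracked 32-bit halves
// while genuine vector params (V4/V16) are still flushed to the context.
// A backend presumably opts in through its instantiation, something like
//   struct SomeRegAlloc : RegAlloc<nreg_t, nregf_t, /*AllocVec2=*/true> {};
// (the exact template parameter list does not appear above, so this
// signature is an assumption).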
2 : 1; bool IsFloat(Sh4RegType reg) { @@ -309,11 +310,16 @@ private: { if (!fast_forwarding) { - ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version, 'a' + reg_alloc.host_reg); if (IsFloat(reg_num)) + { + ssa_printf("WB %s.%d <- xmm%d", name_reg(reg_num).c_str(), reg_alloc.version, reg_alloc.host_reg); Writeback_FPU(reg_num, (nregf_t)reg_alloc.host_reg); + } else + { + ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version, 'a' + reg_alloc.host_reg); Writeback(reg_num, (nreg_t)reg_alloc.host_reg); + } } reg_alloc.write_back = false; reg_alloc.dirty = false; @@ -354,9 +360,12 @@ private: void AllocSourceReg(const shil_param& param) { - if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS? + if (!param.is_reg() || param.count() > MaxVecSize) + return; + for (u32 i = 0; i < param.count(); i++) { - auto it = reg_alloced.find(param._reg); + Sh4RegType sh4reg = (Sh4RegType)(param._reg + i); + auto it = reg_alloced.find(sh4reg); if (it == reg_alloced.end()) { u32 host_reg; @@ -380,14 +389,19 @@ private: host_reg = host_fregs.back(); host_fregs.pop_back(); } - reg_alloced[param._reg] = { host_reg, param.version[0], false, false }; + reg_alloced[sh4reg] = { host_reg, param.version[i], false, false }; if (!fast_forwarding) { - ssa_printf("PL %s.%d -> %cx", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg); - if (IsFloat(param._reg)) - Preload_FPU(param._reg, (nregf_t)host_reg); + if (IsFloat(sh4reg)) + { + ssa_printf("PL %s.%d -> xmm%d", name_reg(sh4reg).c_str(), param.version[i], host_reg); + Preload_FPU(sh4reg, (nregf_t)host_reg); + } else - Preload(param._reg, (nreg_t)host_reg); + { + ssa_printf("PL %s.%d -> %cx", name_reg(sh4reg).c_str(), param.version[i], 'a' + host_reg); + Preload(sh4reg, (nreg_t)host_reg); + } } } } @@ -424,9 +438,12 @@ private: void AllocDestReg(const shil_param& param) { - if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS? + if (!param.is_reg() || param.count() > MaxVecSize) + return; + for (u32 i = 0; i < param.count(); i++) { - auto it = reg_alloced.find(param._reg); + Sh4RegType sh4reg = (Sh4RegType)(param._reg + i); + auto it = reg_alloced.find(sh4reg); if (it == reg_alloced.end()) { u32 host_reg; @@ -450,18 +467,21 @@ private: host_reg = host_fregs.back(); host_fregs.pop_back(); } - reg_alloced[param._reg] = { host_reg, param.version[0], NeedsWriteBack(param._reg, param.version[0]), true }; - ssa_printf(" %s.%d -> %cx %s", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg, reg_alloced[param._reg].write_back ? "(wb)" : ""); + reg_alloced[sh4reg] = { host_reg, param.version[i], NeedsWriteBack(sh4reg, param.version[i]), true }; + if (param.is_r32i()) + ssa_printf(" %s.%d -> %cx %s", name_reg(sh4reg).c_str(), param.version[i], 'a' + host_reg, reg_alloced[sh4reg].write_back ? "(wb)" : ""); + else + ssa_printf(" %s.%d -> xmm%d %s", name_reg(sh4reg).c_str(), param.version[i], host_reg, reg_alloced[sh4reg].write_back ? 
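// [Illustration] Both AllocSourceReg and AllocDestReg now loop over
// param.count() (bounded by MaxVecSize), so each 32-bit half of a 64-bit
// value gets its own host register plus its own SSA version, write-back
// flag and dirty bit; a double is never pinned to an adjacent register
// pair on the host side.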
"(wb)" : ""); } else { - reg_alloc& reg = reg_alloced[param._reg]; + reg_alloc& reg = reg_alloced[sh4reg]; verify(!reg.write_back); - reg.write_back = NeedsWriteBack(param._reg, param.version[0]); + reg.write_back = NeedsWriteBack(sh4reg, param.version[i]); reg.dirty = true; - reg.version = param.version[0]; + reg.version = param.version[i]; } - verify(reg_alloced[param._reg].dirty); + verify(reg_alloced[sh4reg].dirty); } } @@ -544,22 +564,26 @@ private: bool IsVectorOp(shil_opcode* op) { - return op->rs1.count() > 1 || op->rs2.count() > 1 || op->rs3.count() > 1 || op->rd.count() > 1 || op->rd2.count() > 1; + return op->rs1.count() > MaxVecSize + || op->rs2.count() > MaxVecSize + || op->rs3.count() > MaxVecSize + || op->rd.count() > MaxVecSize + || op->rd2.count() > MaxVecSize; } bool UsesReg(shil_opcode* op, Sh4RegType reg, u32 version, bool vector) { if (op->rs1.is_reg() && reg >= op->rs1._reg && reg < (Sh4RegType)(op->rs1._reg + op->rs1.count()) && version == op->rs1.version[reg - op->rs1._reg] - && vector == (op->rs1.count() > 1)) + && vector == (op->rs1.count() > MaxVecSize)) return true; if (op->rs2.is_reg() && reg >= op->rs2._reg && reg < (Sh4RegType)(op->rs2._reg + op->rs2.count()) && version == op->rs2.version[reg - op->rs2._reg] - && vector == (op->rs2.count() > 1)) + && vector == (op->rs2.count() > MaxVecSize)) return true; if (op->rs3.is_reg() && reg >= op->rs3._reg && reg < (Sh4RegType)(op->rs3._reg + op->rs3.count()) && version == op->rs3.version[reg - op->rs3._reg] - && vector == (op->rs3.count() > 1)) + && vector == (op->rs3.count() > MaxVecSize)) return true; return false; @@ -568,10 +592,10 @@ private: bool DefsReg(shil_opcode* op, Sh4RegType reg, bool vector) { if (op->rd.is_reg() && reg >= op->rd._reg && reg < (Sh4RegType)(op->rd._reg + op->rd.count()) - && vector == (op->rd.count() > 1)) + && vector == (op->rd.count() > MaxVecSize)) return true; if (op->rd2.is_reg() && reg >= op->rd2._reg && reg < (Sh4RegType)(op->rd2._reg + op->rd2.count()) - && vector == (op->rd2.count() > 1)) + && vector == (op->rd2.count() > MaxVecSize)) return true; return false; } diff --git a/core/hw/sh4/sh4_mmr.cpp b/core/hw/sh4/sh4_mmr.cpp index f4faa2552..575b7b2dd 100644 --- a/core/hw/sh4/sh4_mmr.cpp +++ b/core/hw/sh4/sh4_mmr.cpp @@ -62,6 +62,7 @@ void sh4_rio_reg(RegisterStruct *arr, u32 addr, RegIO flags, RegReadAddrFP* rf, } else { + verify(!(flags & REG_WO)); // not supported here if (flags & REG_RF) arr[idx].readFunctionAddr = rf; else diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp index a511f3abc..6adad38ca 100644 --- a/core/rec-ARM/rec_arm.cpp +++ b/core/rec-ARM/rec_arm.cpp @@ -120,7 +120,7 @@ const int alloc_regs[] = { 5, 6, 7, 10, 11, -1 }; const int alloc_fpu[] = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -1 }; -struct arm_reg_alloc: RegAlloc +struct arm_reg_alloc: RegAlloc { void Preload(u32 reg, int nreg) override { @@ -149,9 +149,9 @@ struct arm_reg_alloc: RegAlloc ass.Vstr(SRegister(nreg), MemOperand(r8, shRegOffs)); } - SRegister mapFReg(const shil_param& prm) + SRegister mapFReg(const shil_param& prm, int index = 0) { - return SRegister(mapf(prm)); + return SRegister(mapf(prm, index)); } Register mapReg(const shil_param& prm) { @@ -561,16 +561,15 @@ enum mem_op_type static mem_op_type memop_type(shil_opcode* op) { - int sz = op->flags & 0x7f; bool fp32 = op->rs2.is_r32f() || op->rd.is_r32f(); - if (sz == 1) + if (op->size == 1) return SZ_8; - else if (sz == 2) + else if (op->size == 2) return SZ_16; - else if (sz == 4) + else if 
(op->size == 4) return fp32 ? SZ_32F : SZ_32I; - else if (sz == 8) + else if (op->size == 8) return SZ_64F; die("Unknown op"); @@ -855,16 +854,15 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool if (!op->rs1.is_imm()) return false; - u32 size = op->flags & 0x7f; u32 addr = op->rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op->size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; u32 rv; - switch (size) + switch (op->size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -914,8 +912,16 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool break; case SZ_64F: - ass.Vldr(d0, MemOperand(r0)); - ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + if (reg.IsAllocf(op->rd)) + { + ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0)); + ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4)); + } + else + { + ass.Vldr(d0, MemOperand(r0)); + ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + } break; } } @@ -928,11 +934,17 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool // Need to call the handler twice ass.Mov(r0, op->rs1._imm); call(ptr); - ass.Str(r0, MemOperand(r8, op->rd.reg_nofs())); + if (reg.IsAllocf(op->rd)) + ass.Vmov(reg.mapFReg(op->rd, 0), r0); + else + ass.Str(r0, MemOperand(r8, op->rd.reg_nofs())); ass.Mov(r0, op->rs1._imm + 4); call(ptr); - ass.Str(r0, MemOperand(r8, op->rd.reg_nofs() + 4)); + if (reg.IsAllocf(op->rd)) + ass.Vmov(reg.mapFReg(op->rd, 1), r0); + else + ass.Str(r0, MemOperand(r8, op->rd.reg_nofs() + 4)); } else { @@ -975,16 +987,15 @@ static bool ngen_writemem_immediate(RuntimeBlockInfo* block, shil_opcode* op, bo if (!op->rs1.is_imm()) return false; - u32 size = op->flags & 0x7f; u32 addr = op->rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op->size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; u32 rv; - switch (size) + switch (op->size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -1041,8 +1052,16 @@ static bool ngen_writemem_immediate(RuntimeBlockInfo* block, shil_opcode* op, bo break; case SZ_64F: - ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); - ass.Vstr(d0, MemOperand(r0)); + if (reg.IsAllocf(op->rs2)) + { + ass.Vstr(reg.mapFReg(op->rs2, 0), MemOperand(r0)); + ass.Vstr(reg.mapFReg(op->rs2, 1), MemOperand(r0, 4)); + } + else + { + ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); + ass.Vstr(d0, MemOperand(r0)); + } break; default: @@ -1157,9 +1176,20 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o case SZ_64F: ass.Add(r1, r1, r8); //3 opcodes, there's no [REG+REG] VLDR - ass.Vldr(d0, MemOperand(r1)); //TODO: use reg alloc - - ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + ass.Vldr(d0, MemOperand(r1)); + if (reg.IsAllocf(op->rd)) + { + ass.Vmov(r0, r1, d0); + ass.Vmov(reg.mapFReg(op->rd, 0), r0); + ass.Vmov(reg.mapFReg(op->rd, 1), r1); + // easier to do just this but we need to use a different op than 32f to distinguish during rewrite + //ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r1)); + //ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r1, 4)); + } + else + { + 
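// Fallback when rd has no host registers: the value loaded into d0 above
// is written back to the Sh4 context at rd's offset, exactly as before
// this patch; the branch just above is the new register-allocated path
// that moves the two halves into the mapped S registers instead.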
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + } break; } } else { @@ -1183,7 +1213,16 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o case SZ_64F: vmem_slowpath(raddr, r0, s0, d0, optp, true); - ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + if (reg.IsAllocf(op->rd)) + { + ass.Vmov(r0, r1, d0); + ass.Vmov(reg.mapFReg(op->rd, 0), r0); + ass.Vmov(reg.mapFReg(op->rd, 1), r1); + } + else + { + ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + } break; } } @@ -1201,9 +1240,19 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o Register rs2 = r2; SRegister rs2f = s2; - //TODO: use reg alloc if (optp == SZ_64F) - ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); + { + if (reg.IsAllocf(op->rs2)) + { + ass.Vmov(r2, reg.mapFReg(op->rs2, 0)); + ass.Vmov(r3, reg.mapFReg(op->rs2, 1)); + ass.Vmov(d0, r2, r3); + } + else + { + ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs())); + } + } else if (op->rs2.is_imm()) { ass.Mov(rs2, op->rs2._imm); @@ -1242,7 +1291,7 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o case SZ_64F: ass.Add(r1, r1, r8); //3 opcodes: there's no [REG+REG] VLDR, also required for SQ - ass.Vstr(d0, MemOperand(r1)); //TODO: use reg alloc + ass.Vstr(d0, MemOperand(r1)); break; } } else { @@ -1358,9 +1407,18 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o break; case shop_mov64: - verify(op->rs1.is_r64() && op->rd.is_r64()); - ass.Vldr(d0, MemOperand(r8, op->rs1.reg_nofs())); - ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + verify(op->rs1.is_r64f() && op->rd.is_r64f()); + if (reg.IsAllocf(op->rd)) + { + verify(reg.IsAllocf(op->rs1)); + ass.Vmov(reg.mapFReg(op->rd, 0), reg.mapFReg(op->rs1, 0)); + ass.Vmov(reg.mapFReg(op->rd, 1), reg.mapFReg(op->rs1, 1)); + } + else + { + ass.Vldr(d0, MemOperand(r8, op->rs1.reg_nofs())); + ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + } break; case shop_jcond: @@ -1821,8 +1879,16 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o ass.Add(r0, r1, Operand(r0, LSL, 3)); - ass.Vldr(d0, MemOperand(r0)); - ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + if (reg.IsAllocf(op->rd)) + { + ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0)); + ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4)); + } + else + { + ass.Vldr(d0, MemOperand(r0)); + ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs())); + } break; case shop_fipr: diff --git a/core/rec-ARM64/arm64_regalloc.h b/core/rec-ARM64/arm64_regalloc.h index b8b9f3755..6882effd1 100644 --- a/core/rec-ARM64/arm64_regalloc.h +++ b/core/rec-ARM64/arm64_regalloc.h @@ -21,6 +21,8 @@ #include using namespace vixl::aarch64; +#define ALLOC_F64 true + enum eReg { W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W16, W17, W18, W19, W20, W21, W22, W23, W24, W25, W26, W27, W28, W29, W30 @@ -35,7 +37,7 @@ static eFReg alloc_fregs[] = { S8, S9, S10, S11, S12, S13, S14, S15, (eFReg)-1 } class Arm64Assembler; -struct Arm64RegAlloc : RegAlloc +struct Arm64RegAlloc : RegAlloc { Arm64RegAlloc(Arm64Assembler *assembler) : assembler(assembler) {} @@ -57,9 +59,9 @@ struct Arm64RegAlloc : RegAlloc return Register::GetWRegFromCode(ereg); } - const VRegister& MapVRegister(const shil_param& param) + const VRegister& MapVRegister(const shil_param& param, int index = 0) { - eFReg ereg = mapf(param); + eFReg ereg = mapf(param, index); if (ereg == (eFReg)-1) die("VRegister not allocated"); return VRegister::GetSRegFromCode(ereg); diff --git 
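The rec-ARM64 changes that follow repeatedly shuttle a 64-bit guest value between one X register and the two allocated 32-bit halves: Fmov plus Bfm when gathering the pair into a GPR, Fmov plus Lsr when scattering it back. A minimal C++ model of that packing, with hypothetical names:

#include <cstdint>

// join the two allocated 32-bit halves into one 64-bit host value
static inline uint64_t pack64(uint32_t lo, uint32_t hi)
{
	return (uint64_t(hi) << 32) | lo;	// Bfm(reg, x15, 32, 31) performs this insert
}

// split the 64-bit host value back into its two allocated halves
static inline void unpack64(uint64_t v, uint32_t &lo, uint32_t &hi)
{
	lo = uint32_t(v);		// Fmov(half0, reg.W())
	hi = uint32_t(v >> 32);		// Lsr(reg.X(), reg.X(), 32); Fmov(half1, reg.W())
}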
a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp index fb6354699..c3dfb0372 100644 --- a/core/rec-ARM64/rec_arm64.cpp +++ b/core/rec-ARM64/rec_arm64.cpp @@ -387,11 +387,28 @@ public: break; case shop_mov64: - verify(op.rd.is_reg()); - verify(op.rs1.is_reg() || op.rs1.is_imm()); + { + verify(op.rd.is_reg()); + verify(op.rs1.is_reg() || op.rs1.is_imm()); - shil_param_to_host_reg(op.rs1, x15); - host_reg_to_shil_param(op.rd, x15); + if (!regalloc.IsAllocf(op.rd)) + { + verify(!regalloc.IsAllocf(op.rs1)); + shil_param_to_host_reg(op.rs1, x15); + host_reg_to_shil_param(op.rd, x15); + } + else + { + const VRegister& rd0 = regalloc.MapVRegister(op.rd, 0); + const VRegister& rs0 = regalloc.MapVRegister(op.rs1, 0); + if (!rd0.Is(rs0)) + Fmov(rd0, rs0); + const VRegister& rd1 = regalloc.MapVRegister(op.rd, 1); + const VRegister& rs1 = regalloc.MapVRegister(op.rs1, 1); + if (!rd1.Is(rs1)) + Fmov(rd1, rs1); + } + } break; case shop_readm: @@ -904,8 +921,15 @@ public: Add(x1, x1, Operand(regalloc.MapRegister(op.rs1), UXTH, 3)); else Add(x1, x1, Operand(op.rs1.imm_value() << 3)); - Ldr(x2, MemOperand(x1)); - Str(x2, sh4_context_mem_operand(op.rd.reg_ptr())); + if (regalloc.IsAllocf(op.rd)) + { + Ldp(regalloc.MapVRegister(op.rd, 0), regalloc.MapVRegister(op.rd, 1), MemOperand(x1)); + } + else + { + Ldr(x2, MemOperand(x1)); + Str(x2, sh4_context_mem_operand(op.rd.reg_ptr())); + } break; case shop_fipr: @@ -1659,14 +1683,13 @@ private: GenMemAddr(op, &w0); genMmuLookup(op, 0); - u32 size = op.flags & 0x7f; if (!optimise || !GenReadMemoryFast(op, opid)) - GenReadMemorySlow(size); + GenReadMemorySlow(op.size); - if (size < 8) + if (op.size < 8) host_reg_to_shil_param(op.rd, w0); else - Str(x0, sh4_context_mem_operand(op.rd.reg_ptr())); + host_reg_to_shil_param(op.rd, x0); } bool GenReadMemoryImmediate(const shil_opcode& op) @@ -1674,16 +1697,15 @@ private: if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op.size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; u32 rv; - switch (size) + switch (op.size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -1705,14 +1727,14 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 
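// Note: the constant-address lookup is capped at 4 bytes, presumably
// because the memory map exposes at most 32-bit handlers; an 8-byte
// access to a non-RAM address is therefore emitted as two handler calls,
// at addr and at addr + 4 (see the "call the handler twice" paths below).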
4 : op.size); if (isram) { Ldr(x1, reinterpret_cast(ptr)); // faster than Mov if (regalloc.IsAllocAny(op.rd)) { - switch (size) + switch (op.size) { case 1: Ldrsb(regalloc.MapRegister(op.rd), MemOperand(x1)); @@ -1729,6 +1751,10 @@ private: Ldr(regalloc.MapRegister(op.rd), MemOperand(x1)); break; + case 8: + Ldp(regalloc.MapVRegister(op.rd, 0), regalloc.MapVRegister(op.rd, 1), MemOperand(x1)); + break; + default: die("Invalid size"); break; @@ -1736,7 +1762,7 @@ private: } else { - switch (size) + switch (op.size) { case 1: Ldrsb(w1, MemOperand(x1)); @@ -1758,7 +1784,7 @@ private: die("Invalid size"); break; } - if (size == 8) + if (op.size == 8) Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); else Str(w1, sh4_context_mem_operand(op.rd.reg_ptr())); @@ -1767,23 +1793,28 @@ private: else { // Not RAM - if (size == 8) + if (op.size == 8) { - verify(!regalloc.IsAllocAny(op.rd)); // Need to call the handler twice Mov(w0, addr); GenCallRuntime((void (*)())ptr); - Str(w0, sh4_context_mem_operand(op.rd.reg_ptr())); + if (regalloc.IsAllocf(op.rd)) + Fmov(regalloc.MapVRegister(op.rd, 0), w0); + else + Str(w0, sh4_context_mem_operand(op.rd.reg_ptr())); Mov(w0, addr + 4); GenCallRuntime((void (*)())ptr); - Str(w0, sh4_context_mem_operand((u8*)op.rd.reg_ptr() + 4)); + if (regalloc.IsAllocf(op.rd)) + Fmov(regalloc.MapVRegister(op.rd, 1), w0); + else + Str(w0, sh4_context_mem_operand((u8*)op.rd.reg_ptr() + 4)); } else { Mov(w0, addr); - switch(size) + switch(op.size) { case 1: GenCallRuntime((void (*)())ptr); @@ -1830,8 +1861,7 @@ private: Ubfx(x1, x0, 0, 29); Add(x1, x1, sizeof(Sh4Context), LeaveFlags); - u32 size = op.flags & 0x7f; - switch(size) + switch (op.size) { case 1: Ldrsb(w0, MemOperand(x28, x1)); @@ -1862,15 +1892,14 @@ private: GenMemAddr(op, &w0); genMmuLookup(op, 1); - u32 size = op.flags & 0x7f; - if (size != 8) + if (op.size != 8) shil_param_to_host_reg(op.rs2, w1); else shil_param_to_host_reg(op.rs2, x1); if (optimise && GenWriteMemoryFast(op, opid)) return; - GenWriteMemorySlow(size); + GenWriteMemorySlow(op.size); } bool GenWriteMemoryImmediate(const shil_opcode& op) @@ -1878,16 +1907,15 @@ private: if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op.size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; u32 rv; - switch (size) + switch (op.size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -1909,11 +1937,11 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 
4 : op.size); - Register reg2; - if (size != 8) + if (isram) { + Register reg2; if (op.rs2.is_imm()) { Mov(w1, op.rs2._imm); @@ -1923,6 +1951,11 @@ private: { reg2 = regalloc.MapRegister(op.rs2); } + else if (op.size == 8) + { + shil_param_to_host_reg(op.rs2, x1); + reg2 = x1; + } else if (regalloc.IsAllocf(op.rs2)) { Fmov(w1, regalloc.MapVRegister(op.rs2)); @@ -1930,11 +1963,9 @@ private: } else die("Invalid rs2 param"); - } - if (isram) - { + Ldr(x0, reinterpret_cast(ptr)); - switch (size) + switch (op.size) { case 1: Strb(reg2, MemOperand(x0)); @@ -1949,8 +1980,7 @@ private: break; case 8: - shil_param_to_host_reg(op.rs2, x1); - Str(x1, MemOperand(x0)); + Str(reg2, MemOperand(x0)); break; default: @@ -1962,10 +1992,10 @@ private: { // Not RAM Mov(w0, addr); - if (size == 8) + shil_param_to_host_reg(op.rs2, x1); + if (op.size == 8) { // Need to call the handler twice - shil_param_to_host_reg(op.rs2, x1); GenCallRuntime((void (*)())ptr); Mov(w0, addr + 4); @@ -1975,7 +2005,6 @@ private: } else { - Mov(w1, reg2); GenCallRuntime((void (*)())ptr); } } @@ -1996,8 +2025,7 @@ private: Ubfx(x7, x0, 0, 29); Add(x7, x7, sizeof(Sh4Context), LeaveFlags); - u32 size = op.flags & 0x7f; - switch(size) + switch(op.size) { case 1: Strb(w1, MemOperand(x28, x7)); @@ -2112,21 +2140,28 @@ private: } else if (param.is_reg()) { - if (param.is_r64f()) + if (param.is_r64f() && !regalloc.IsAllocf(param)) + { Ldr(reg, sh4_context_mem_operand(param.reg_ptr())); - else if (param.is_r32f()) + } + else if (param.is_r32f() || param.is_r64f()) { if (regalloc.IsAllocf(param)) - Fmov(reg, regalloc.MapVRegister(param)); + Fmov(reg.W(), regalloc.MapVRegister(param, 0)); else - Ldr(reg, sh4_context_mem_operand(param.reg_ptr())); + Ldr(reg.W(), sh4_context_mem_operand(param.reg_ptr())); + if (param.is_r64f()) + { + Fmov(w15, regalloc.MapVRegister(param, 1)); + Bfm(reg, x15, 32, 31); + } } else { if (regalloc.IsAllocg(param)) - Mov(reg, regalloc.MapRegister(param)); + Mov(reg.W(), regalloc.MapRegister(param)); else - Ldr(reg, sh4_context_mem_operand(param.reg_ptr())); + Ldr(reg.W(), sh4_context_mem_operand(param.reg_ptr())); } } else @@ -2139,7 +2174,17 @@ private: { if (reg.Is64Bits()) { - Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr())); + if (regalloc.IsAllocf(param)) + { + verify(param.count() == 2); + Fmov(regalloc.MapVRegister(param, 0), reg.W()); + Lsr(reg.X(), reg.X(), 32); + Fmov(regalloc.MapVRegister(param, 1), reg.W()); + } + else + { + Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr())); + } } else if (regalloc.IsAllocg(param)) { diff --git a/core/rec-cpp/rec_cpp.cpp b/core/rec-cpp/rec_cpp.cpp index 4189d7a86..b9a012319 100644 --- a/core/rec-cpp/rec_cpp.cpp +++ b/core/rec-cpp/rec_cpp.cpp @@ -1665,7 +1665,7 @@ public: case shop_readm: { - u32 size = op.flags & 0x7f; + u32 size = op.size; if (op.rs1.is_imm()) { verify(op.rs2.is_null() && op.rs3.is_null()); @@ -1748,7 +1748,7 @@ public: case shop_writem: { - u32 size = op.flags & 0x7f; + u32 size = op.size; if (op.rs1.is_imm()) { verify(op.rs3.is_null()); diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index 7ba4dfce8..d41007657 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -215,13 +215,24 @@ public: case shop_mov64: { - verify(op.rd.is_r64()); - verify(op.rs1.is_r64()); + verify(op.rd.is_r64f()); + verify(op.rs1.is_r64f()); +#if ALLOC_F64 == false mov(rax, (uintptr_t)op.rs1.reg_ptr()); mov(rax, qword[rax]); mov(rcx, (uintptr_t)op.rd.reg_ptr()); mov(qword[rcx], rax); +#else + Xbyak::Xmm rd = 
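// [Illustration] With ALLOC_F64 enabled, a 64-bit register-to-register
// move is just two scalar movss copies between the mapped XMM halves;
// the rd != rs tests below elide the copy when a half already sits in
// the right register.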
regalloc.MapXRegister(op.rd, 0); + Xbyak::Xmm rs = regalloc.MapXRegister(op.rs1, 0); + if (rd != rs) + movss(rd, rs); + rd = regalloc.MapXRegister(op.rd, 1); + rs = regalloc.MapXRegister(op.rs1, 1); + if (rd != rs) + movss(rd, rs); +#endif } break; @@ -244,16 +255,18 @@ public: } genMmuLookup(block, op, 0); - int size = op.flags & 0x7f; - size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64; + int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64; GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::R], mmu_enabled()); - if (size != MemSize::S64) - host_reg_to_shil_param(op.rd, eax); - else { +#if ALLOC_F64 == false + if (size == MemSize::S64) + { mov(rcx, (uintptr_t)op.rd.reg_ptr()); mov(qword[rcx], rax); } + else +#endif + host_reg_to_shil_param(op.rd, rax); } break; @@ -276,15 +289,17 @@ public: } genMmuLookup(block, op, 1); - u32 size = op.flags & 0x7f; - if (size != 8) - shil_param_to_host_reg(op.rs2, call_regs[1]); - else { +#if ALLOC_F64 == false + if (op.size == 8) + { mov(rax, (uintptr_t)op.rs2.reg_ptr()); mov(call_regs64[1], qword[rax]); } + else +#endif + shil_param_to_host_reg(op.rs2, call_regs64[1]); - size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64; + int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64; GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::W], mmu_enabled()); } } @@ -809,9 +824,8 @@ private: { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op.size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page @@ -819,7 +833,7 @@ private: u32 paddr; u32 rv; - switch (size) + switch (op.size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -841,13 +855,13 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 
4 : op.size); if (isram) { // Immediate pointer to RAM: super-duper fast access mov(rax, reinterpret_cast(ptr)); - switch (size) + switch (op.size) { case 1: if (regalloc.IsAllocg(op.rd)) @@ -885,9 +899,14 @@ private: break; case 8: +#if ALLOC_F64 == false mov(rcx, qword[rax]); mov(rax, (uintptr_t)op.rd.reg_ptr()); mov(qword[rax], rcx); +#else + movd(regalloc.MapXRegister(op.rd, 0), dword[rax]); + movd(regalloc.MapXRegister(op.rd, 1), dword[rax + 4]); +#endif break; default: @@ -898,26 +917,32 @@ private: else { // Not RAM: the returned pointer is a memory handler - if (size == 8) + if (op.size == 8) { - verify(!regalloc.IsAllocAny(op.rd)); - // Need to call the handler twice mov(call_regs[0], addr); GenCall((void (*)())ptr); +#if ALLOC_F64 == false mov(rcx, (size_t)op.rd.reg_ptr()); mov(dword[rcx], eax); +#else + mov(regalloc.MapXRegister(op.rd, 0), eax); +#endif mov(call_regs[0], addr + 4); GenCall((void (*)())ptr); +#if ALLOC_F64 == false mov(rcx, (size_t)op.rd.reg_ptr() + 4); mov(dword[rcx], eax); +#else + mov(regalloc.MapXRegister(op.rd, 1), eax); +#endif } else { mov(call_regs[0], addr); - switch(size) + switch(op.size) { case 1: GenCall((void (*)())ptr); @@ -948,9 +973,8 @@ private: { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1._imm; - if (mmu_enabled() && mmu_is_translated(addr, size)) + if (mmu_enabled() && mmu_is_translated(addr, op.size)) { if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) // When full mmu is on, only consider addresses in the same 4k page @@ -958,7 +982,7 @@ private: u32 paddr; u32 rv; - switch (size) + switch (op.size) { case 1: rv = mmu_data_translation(addr, paddr); @@ -980,13 +1004,13 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 
4 : op.size); if (isram) { // Immediate pointer to RAM: super-duper fast access mov(rax, reinterpret_cast(ptr)); - switch (size) + switch (op.size) { case 1: if (regalloc.IsAllocg(op.rs2)) @@ -1030,9 +1054,14 @@ private: break; case 8: +#if ALLOC_F64 == false mov(rcx, (uintptr_t)op.rs2.reg_ptr()); mov(rcx, qword[rcx]); mov(qword[rax], rcx); +#else + movd(dword[rax], regalloc.MapXRegister(op.rs2, 0)); + movd(dword[rax + 4], regalloc.MapXRegister(op.rs2, 1)); +#endif break; default: diff --git a/core/rec-x64/x64_regalloc.h b/core/rec-x64/x64_regalloc.h index bf5c06dc1..af1cdb429 100644 --- a/core/rec-x64/x64_regalloc.h +++ b/core/rec-x64/x64_regalloc.h @@ -25,15 +25,18 @@ static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 }; static s8 alloc_fregs[] = { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 }; // XMM6 to XMM15 are callee-saved in Windows +#define ALLOC_F64 true #else static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 }; static s8 alloc_fregs[] = { 8, 9, 10, 11, -1 }; // XMM8-11 +// all xmm registers are caller-saved on linux +#define ALLOC_F64 false #endif class BlockCompiler; -struct X64RegAlloc : RegAlloc +struct X64RegAlloc : RegAlloc { X64RegAlloc(BlockCompiler *compiler) : compiler(compiler) {} @@ -55,9 +58,9 @@ struct X64RegAlloc : RegAlloc return Xbyak::Reg32(ereg); } - Xbyak::Xmm MapXRegister(const shil_param& param) + Xbyak::Xmm MapXRegister(const shil_param& param, int index = 0) { - s8 ereg = mapf(param); + s8 ereg = mapf(param, index); if (ereg == -1) die("VRegister not allocated"); return Xbyak::Xmm(ereg); diff --git a/core/rec-x64/xbyak_base.h b/core/rec-x64/xbyak_base.h index 3bdd0569b..4fa95debf 100644 --- a/core/rec-x64/xbyak_base.h +++ b/core/rec-x64/xbyak_base.h @@ -587,13 +587,19 @@ protected: #ifndef XBYAK32 mov(rcx, (uintptr_t)&sin_table); mov(rcx, qword[rcx + rax * 8]); +#if ALLOC_F64 == false mov(rdx, (uintptr_t)op.rd.reg_ptr()); mov(qword[rdx], rcx); +#else + movd(mapXRegister(op.rd, 0), ecx); + shr(rcx, 32); + movd(mapXRegister(op.rd, 1), ecx); +#endif #endif } else { -#ifdef EXPLODE_SPANS +#if ALLOC_F64 == true movss(mapXRegister(op.rd, 0), dword[(size_t)&sin_table + eax * 8]); movss(mapXRegister(op.rd, 1), dword[(size_t)&sin_table[0].u[1] + eax * 8]); #else @@ -653,15 +659,25 @@ protected: } else if (param.is_reg()) { - if (param.is_r32f()) + if (isAllocf(param)) { - if (isAllocf(param)) + if (param.is_r32f() || param.is_r64f()) { - Xbyak::Xmm sreg = mapXRegister(param); + Xbyak::Xmm sreg = mapXRegister(param, 0); if (!reg.isXMM()) - movd((const Xbyak::Reg32 &)reg, sreg); + movd(reg.cvt32(), sreg); else if (reg != sreg) movss((const Xbyak::Xmm &)reg, sreg); +#ifndef XBYAK32 + if (param.is_r64f()) + { + sreg = mapXRegister(param, 1); + verify(reg != rax); + movd(eax, sreg); + shl(rax, 32); + or_(reg, rax); + } +#endif } else { @@ -670,44 +686,41 @@ protected: { #ifndef XBYAK32 mov(rax, (size_t)param.reg_ptr()); - mov((const Xbyak::Reg32 &)reg, dword[rax]); + mov(reg.cvt32(), dword[rax]); #endif } else { - mov((const Xbyak::Reg32 &)reg, dword[param.reg_ptr()]); + mov(reg.cvt32(), dword[param.reg_ptr()]); } } } + else if (isAllocg(param)) + { + Xbyak::Reg32 sreg = mapRegister(param); + if (reg.isXMM()) + movd((const Xbyak::Xmm &)reg, sreg); 
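// [Illustration] For an allocated r64f parameter, shil_param_to_host_reg
// now packs both XMM halves into the single 64-bit GPR requested by the
// caller: the low half arrives via movd, then the high half is movd'd
// into eax, shifted left 32 and or'ed in (see the XBYAK32 guard above),
// so downstream code can store all 8 bytes with one qword move.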
+ else if (reg != sreg) + mov(reg.cvt32(), sreg); + } else { - if (isAllocg(param)) + if (ArchX64) { - Xbyak::Reg32 sreg = mapRegister(param); - if (reg.isXMM()) - movd((const Xbyak::Xmm &)reg, sreg); - else if (reg != sreg) - mov((const Xbyak::Reg32 &)reg, sreg); +#ifndef XBYAK32 + mov(rax, (size_t)param.reg_ptr()); + if (!reg.isXMM()) + mov(reg.cvt32(), dword[rax]); + else + movss((const Xbyak::Xmm &)reg, dword[rax]); +#endif } else { - if (ArchX64) - { -#ifndef XBYAK32 - mov(rax, (size_t)param.reg_ptr()); - if (!reg.isXMM()) - mov((const Xbyak::Reg32 &)reg, dword[rax]); - else - movss((const Xbyak::Xmm &)reg, dword[rax]); -#endif - } + if (!reg.isXMM()) + mov(reg.cvt32(), dword[param.reg_ptr()]); else - { - if (!reg.isXMM()) - mov((const Xbyak::Reg32 &)reg, dword[param.reg_ptr()]); - else - movss((const Xbyak::Xmm &)reg, dword[param.reg_ptr()]); - } + movss((const Xbyak::Xmm &)reg, dword[param.reg_ptr()]); } } } @@ -724,17 +737,25 @@ protected: { Xbyak::Reg32 sreg = mapRegister(param); if (!reg.isXMM()) - mov(sreg, (const Xbyak::Reg32 &)reg); + mov(sreg, reg.cvt32()); else if (reg != sreg) movd(sreg, (const Xbyak::Xmm &)reg); } else if (isAllocf(param)) { - Xbyak::Xmm sreg = mapXRegister(param); + Xbyak::Xmm sreg = mapXRegister(param, 0); if (!reg.isXMM()) - movd(sreg, (const Xbyak::Reg32 &)reg); + movd(sreg, reg.cvt32()); else if (reg != sreg) movss(sreg, (const Xbyak::Xmm &)reg); +#ifndef XBYAK32 + if (param.is_r64f()) + { + sreg = mapXRegister(param, 1); + shr(reg, 32); + movd(sreg, reg.cvt32()); + } +#endif } else { @@ -743,7 +764,7 @@ protected: #ifndef XBYAK32 mov(rax, (size_t)param.reg_ptr()); if (!reg.isXMM()) - mov(dword[rax], (const Xbyak::Reg32 &)reg); + mov(dword[rax], reg.cvt32()); else movss(dword[rax], (const Xbyak::Xmm &)reg); #endif @@ -751,7 +772,7 @@ protected: else { if (!reg.isXMM()) - mov(dword[param.reg_ptr()], (const Xbyak::Reg32 &)reg); + mov(dword[param.reg_ptr()], reg.cvt32()); else movss(dword[param.reg_ptr()], (const Xbyak::Xmm &)reg); } @@ -763,16 +784,16 @@ private: return static_cast(this)->regalloc.MapRegister(param); } - Xbyak::Xmm mapXRegister(const shil_param& param) { - return static_cast(this)->regalloc.MapXRegister(param); + Xbyak::Xmm mapXRegister(const shil_param& param, int index = 0) { + return static_cast(this)->regalloc.MapXRegister(param, index); } int mapg(const shil_param& param) { return (int)static_cast(this)->regalloc.mapg(param); } - int mapf(const shil_param& param) { - return (int)static_cast(this)->regalloc.mapf(param); + int mapf(const shil_param& param, int index = 0) { + return (int)static_cast(this)->regalloc.mapf(param, index); } bool isAllocg(const shil_param& param) { diff --git a/core/rec-x86/rec_x86.cpp b/core/rec-x86/rec_x86.cpp index 9ec8e5d4b..77170380c 100644 --- a/core/rec-x86/rec_x86.cpp +++ b/core/rec-x86/rec_x86.cpp @@ -526,15 +526,14 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1.imm_value(); bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 
4 : op.size); if (isram) { // Immediate pointer to RAM: super-duper fast access - switch (size) + switch (op.size) { case 1: if (regalloc.IsAllocg(op.rd)) @@ -569,14 +568,12 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b break; case 8: -#ifdef EXPLODE_SPANS - if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)) + if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd)) { movd(regalloc.MapXRegister(op.rd, 0), dword[ptr]); movd(regalloc.MapXRegister(op.rd, 1), dword[(u32 *)ptr + 1]); } else -#endif { movq(xmm0, qword[ptr]); movq(qword[op.rd.reg_ptr()], xmm0); @@ -591,7 +588,7 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b else { // Not RAM: the returned pointer is a memory handler - if (size == 8) + if (op.size == 8) { verify(!regalloc.IsAllocAny(op.rd)); @@ -608,7 +605,7 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b { mov(ecx, addr); - switch(size) + switch(op.size) { case 1: genCall((void (DYNACALL *)())ptr); @@ -639,15 +636,14 @@ bool X86Compiler::genWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo* { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; u32 addr = op.rs1.imm_value(); bool isram = false; - void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 4 : op.size); if (isram) { // Immediate pointer to RAM: super-duper fast access - switch (size) + switch (op.size) { case 1: if (regalloc.IsAllocg(op.rs2)) @@ -697,14 +693,12 @@ bool X86Compiler::genWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo* break; case 8: -#ifdef EXPLODE_SPANS - if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) + if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2)) { movd(dword[ptr], regalloc.MapXRegister(op.rs2, 0)); movd(dword[(u32 *)ptr + 1], regalloc.MapXRegister(op.rs2, 1)); } else -#endif { movq(xmm0, qword[op.rs2.reg_ptr()]); movq(qword[ptr], xmm0); diff --git a/core/rec-x86/x86_ops.cpp b/core/rec-x86/x86_ops.cpp index f6657f299..359b6bf16 100644 --- a/core/rec-x86/x86_ops.cpp +++ b/core/rec-x86/x86_ops.cpp @@ -268,10 +268,10 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode& break; case shop_mov64: - verify(op.rd.is_r64()); - verify(op.rs1.is_r64()); + verify(op.rd.is_r64f()); + verify(op.rs1.is_r64f()); -#ifdef EXPLODE_SPANS +#if ALLOC_F64 == true movss(regalloc.MapXRegister(op.rd, 0), regalloc.MapXRegister(op.rs1, 0)); movss(regalloc.MapXRegister(op.rd, 1), regalloc.MapXRegister(op.rs1, 1)); #else @@ -297,7 +297,7 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode& } int memOpSize; - switch (op.flags & 0x7f) + switch (op.size) { case 1: memOpSize = MemSize::S8; @@ -329,14 +329,12 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode& } else { -#ifdef EXPLODE_SPANS - if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)) + if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd)) { mov(regalloc.MapXRegister(op.rd, 0), xmm0); mov(regalloc.MapXRegister(op.rd, 1), xmm1); } else -#endif { verify(!regalloc.IsAllocAny(op.rd)); movss(dword[op.rd.reg_ptr()], xmm0); @@ -361,7 +359,7 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode& } int memOpSize; - switch (op.flags & 0x7f) + switch (op.size) { case 1: memOpSize = MemSize::S8; @@ -382,14 +380,12 @@ void 
X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode& else if (memOpSize == MemSize::F32) shil_param_to_host_reg(op.rs2, xmm0); else { -#ifdef EXPLODE_SPANS - if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) + if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2)) { mov(xmm0, regalloc.MapXRegister(op.rs2, 0)); mov(xmm1, regalloc.MapXRegister(op.rs2, 1)); } else -#endif { movd(xmm0, dword[op.rs2.reg_ptr()]); movd(xmm1, dword[op.rs2.reg_ptr() + 1]); diff --git a/core/rec-x86/x86_regalloc.h b/core/rec-x86/x86_regalloc.h index d0ba848b1..1b7990a70 100644 --- a/core/rec-x86/x86_regalloc.h +++ b/core/rec-x86/x86_regalloc.h @@ -19,9 +19,11 @@ #pragma once #include "hw/sh4/dyna/ssa_regalloc.h" +#define ALLOC_F64 false + class X86Compiler; -struct X86RegAlloc : RegAlloc +struct X86RegAlloc : RegAlloc { X86RegAlloc(X86Compiler *compiler) : compiler(compiler) {} @@ -40,9 +42,9 @@ struct X86RegAlloc : RegAlloc return Xbyak::Reg32(ereg); } - Xbyak::Xmm MapXRegister(const shil_param& param) + Xbyak::Xmm MapXRegister(const shil_param& param, int index = 0) { - s8 ereg = mapf(param); + s8 ereg = mapf(param, index); if (ereg == -1) die("VRegister not allocated"); return Xbyak::Xmm(ereg);
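Taken together, a backend opts into the two-host-register scheme through the ALLOC_F64 switch defined next to its allocator, as the x64 and x86 headers above do. A hedged sketch of what the instantiation presumably looks like (the RegAlloc template parameter lists do not appear above, so the exact signature is an assumption; the AllocVec2 name comes from the MaxVecSize definition in ssa_regalloc.h):

#include <cstdint>
using u32 = uint32_t;

// assumed shape of the allocator template; only the boolean is new
template <typename nreg_t, typename nregf_t, bool AllocVec2 = false>
struct RegAllocSketch
{
	// widest parameter kept in host registers: F64 pairs qualify only
	// when AllocVec2 is set, wider vectors never do
	static constexpr u32 MaxVecSize = AllocVec2 ? 2 : 1;
};

// Windows x64 keeps xmm6-xmm15 callee-saved, so double halves survive
// helper calls; SysV x86-64 marks every xmm caller-saved, hence the
// feature stays off there.
struct WinX64AllocSketch : RegAllocSketch<int, int, /*AllocVec2=*/true> {};
static_assert(WinX64AllocSketch::MaxVecSize == 2, "two 32-bit halves per double");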