diff --git a/core/hw/mem/_vmem.cpp b/core/hw/mem/_vmem.cpp index 7e4172ef1..9585dd657 100644 --- a/core/hw/mem/_vmem.cpp +++ b/core/hw/mem/_vmem.cpp @@ -97,6 +97,46 @@ void* _vmem_read_const(u32 addr,bool& ismem,u32 sz) return 0; } +void* _vmem_write_const(u32 addr,bool& ismem,u32 sz) +{ + u32 page=addr>>24; + unat iirf=(unat)_vmem_MemInfo_ptr[page]; + void* ptr=(void*)(iirf&~HANDLER_MAX); + + if (ptr==0) + { + ismem=false; + const unat id=iirf; + if (sz==1) + { + return (void*)_vmem_WF8[id/4]; + } + else if (sz==2) + { + return (void*)_vmem_WF16[id/4]; + } + else if (sz==4) + { + return (void*)_vmem_WF32[id/4]; + } + else + { + die("Invalid size"); + } + } + else + { + ismem=true; + addr<<=iirf; + addr>>=iirf; + + return &(((u8*)ptr)[addr]); + } + die("Invalid memory size"); + + return 0; +} + void* _vmem_page_info(u32 addr,bool& ismem,u32 sz,u32& page_sz,bool rw) { u32 page=addr>>24; diff --git a/core/hw/mem/_vmem.h b/core/hw/mem/_vmem.h index 5ea610c68..f93d4842b 100644 --- a/core/hw/mem/_vmem.h +++ b/core/hw/mem/_vmem.h @@ -100,6 +100,7 @@ void _vmem_release(); void _vmem_get_ptrs(u32 sz,bool write,void*** vmap,void*** func); void* _vmem_get_ptr2(u32 addr,u32& mask); void* _vmem_read_const(u32 addr,bool& ismem,u32 sz); +void* _vmem_write_const(u32 addr,bool& ismem,u32 sz); extern u8* virt_ram_base; extern bool vmem_4gb_space; diff --git a/core/hw/sh4/dyna/shil.cpp b/core/hw/sh4/dyna/shil.cpp index 2fe2f4e6d..47b963b4c 100644 --- a/core/hw/sh4/dyna/shil.cpp +++ b/core/hw/sh4/dyna/shil.cpp @@ -824,7 +824,7 @@ void constlink(RuntimeBlockInfo* blk) else if (def==NoReg && op->rs1.is_imm() && op->rs1._imm==0) { //def=op->rd._reg; - val=op->rs1._imm; + val = op->rs1._imm; } } } @@ -892,8 +892,9 @@ void srt_waw(RuntimeBlockInfo* blk) //Seems to be working void AnalyseBlock(RuntimeBlockInfo* blk) { - //SSAOptimizer optim(blk); - //optim.Optimize(); + SSAOptimizer optim(blk); + optim.Optimize(); + return; u32 st[sh4_reg_count]={0}; /* diff --git 
a/core/hw/sh4/dyna/ssa.cpp b/core/hw/sh4/dyna/ssa.cpp index 6ba3000c5..b40e9f2fd 100644 --- a/core/hw/sh4/dyna/ssa.cpp +++ b/core/hw/sh4/dyna/ssa.cpp @@ -190,6 +190,8 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode& op) case shop_setae: rd = rs1 >= rs2; break; + default: + break; } } break; diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index d454732c6..8dd2b287e 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -42,7 +42,7 @@ public: ConstPropPass(); DeadCodeRemovalPass(); - ConstantExpressionsPass(); + SimplifyExpressionPass(); CombineShiftsPass(); DeadRegisterPass(); IdentityMovePass(); @@ -355,7 +355,7 @@ private: } } - void ConstantExpressionsPass() + void SimplifyExpressionPass() { for (int opnum = 0; opnum < block->oplist.size(); opnum++) { @@ -404,11 +404,12 @@ private: continue; } } - // Not sure it's worth the trouble, except for the xor perhaps + // Not sure it's worth the trouble, except for the 'and' and 'xor' else if (op.rs1.is_r32i() && op.rs1._reg == op.rs2._reg) { // a ^ a == 0 - if (op.op == shop_xor) + // a - a == 0 + if (op.op == shop_xor || op.op == shop_sub) { //printf("%08x ZERO %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str()); ReplaceByMov32(op, 0); @@ -420,6 +421,14 @@ private: //printf("%08x IDEN %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str()); ReplaceByMov32(op); } + // a + a == a * 2 == a << 1 + else if (op.op == shop_add) + { + // There's quite a few of these + //printf("%08x +t<< %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str()); + op.op = shop_shl; + op.rs2 = shil_param(FMT_IMM, 1); + } } } } diff --git a/core/hw/sh4/dyna/ssa_regalloc.h b/core/hw/sh4/dyna/ssa_regalloc.h index 6b12f6ed7..509e2f767 100644 --- a/core/hw/sh4/dyna/ssa_regalloc.h +++ b/core/hw/sh4/dyna/ssa_regalloc.h @@ -247,11 +247,9 @@ public: virtual void Preload(u32 reg, nreg_t nreg) = 0; virtual void Writeback(u32 reg, nreg_t nreg) = 0; - virtual void CheckReg(u32 reg, nreg_t nreg) = 0; virtual void 
Preload_FPU(u32 reg, nregf_t nreg) = 0; virtual void Writeback_FPU(u32 reg, nregf_t nreg) = 0; - virtual void CheckReg_FPU(u32 reg, nregf_t nreg) = 0; private: struct reg_alloc { diff --git a/core/rec-ARM64/arm64_regalloc.h b/core/rec-ARM64/arm64_regalloc.h index 63a23545f..92404a345 100644 --- a/core/rec-ARM64/arm64_regalloc.h +++ b/core/rec-ARM64/arm64_regalloc.h @@ -20,8 +20,11 @@ #ifndef CORE_REC_ARM64_ARM64_REGALLOC_H_ #define CORE_REC_ARM64_ARM64_REGALLOC_H_ - +#ifdef OLD_REGALLOC #include "hw/sh4/dyna/regalloc.h" +#else +#include "hw/sh4/dyna/ssa_regalloc.h" +#endif #include "deps/vixl/aarch64/macro-assembler-aarch64.h" using namespace vixl::aarch64; @@ -67,7 +70,15 @@ struct Arm64RegAlloc : RegAlloc(op.rs1._imm)); else if (regalloc.IsAllocf(op.rs1)) Fmov(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1)); else @@ -483,9 +483,13 @@ public: break; case shop_swaplb: - Mov(w9, Operand(regalloc.MapRegister(op.rs1), LSR, 16)); - Rev16(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - Bfi(regalloc.MapRegister(op.rd), w9, 16, 16); + { + const Register rs1 = regalloc.MapRegister(op.rs1); + const Register rd = regalloc.MapRegister(op.rd); + Mov(w9, Operand(rs1, LSR, 16)); + Rev16(rd, rs1); + Bfi(rd, w9, 16, 16); + } break; case shop_neg: @@ -536,60 +540,182 @@ public: break; case shop_adc: - Cmp(regalloc.MapRegister(op.rs3), 1); // C = rs3 - Adcs(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); // (C,rd)=rs1+rs2+rs3(C) - Cset(regalloc.MapRegister(op.rd2), cs); // rd2 = C + { + Register reg1; + Operand op2; + Register reg3; + if (op.rs1.is_imm()) + { + Mov(w0, op.rs1.imm_value()); + reg1 = w0; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } + if (op.rs2.is_imm()) + op2 = Operand(op.rs2.imm_value()); + else + op2 = regalloc.MapRegister(op.rs2); + if (op.rs3.is_imm()) + { + Mov(w1, op.rs3.imm_value()); + reg3 = w1; + } + else + { + reg3 = regalloc.MapRegister(op.rs3); + } + Cmp(reg3, 1); // C = rs3 
+ Adcs(regalloc.MapRegister(op.rd), reg1, op2); // (C,rd)=rs1+rs2+rs3(C) + Cset(regalloc.MapRegister(op.rd2), cs); // rd2 = C + } break; case shop_sbc: - Cmp(wzr, regalloc.MapRegister(op.rs3)); // C = ~rs3 - Sbcs(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); // (C,rd) = rs1 - rs2 - ~rs3(C) - Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C + { + Register reg1; + Operand op2; + Operand op3; + if (op.rs1.is_imm()) + { + Mov(w0, op.rs1.imm_value()); + reg1 = w0; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } + if (op.rs2.is_imm()) + op2 = Operand(op.rs2.imm_value()); + else + op2 = regalloc.MapRegister(op.rs2); + if (op.rs3.is_imm()) + op3 = Operand(op.rs3.imm_value()); + else + op3 = regalloc.MapRegister(op.rs3); + Cmp(wzr, op3); // C = ~rs3 + Sbcs(regalloc.MapRegister(op.rd), reg1, op2); // (C,rd) = rs1 - rs2 - ~rs3(C) + Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C + } break; case shop_negc: - Cmp(wzr, regalloc.MapRegister(op.rs2)); // C = ~rs2 - Sbcs(regalloc.MapRegister(op.rd), wzr, regalloc.MapRegister(op.rs1)); // (C,rd) = 0 - rs1 - ~rs2(C) - Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C + { + Operand op1; + Operand op2; + if (op.rs1.is_imm()) + op1 = Operand(op.rs1.imm_value()); + else + op1 = regalloc.MapRegister(op.rs1); + if (op.rs2.is_imm()) + op2 = Operand(op.rs2.imm_value()); + else + op2 = regalloc.MapRegister(op.rs2); + Cmp(wzr, op2); // C = ~rs2 + Sbcs(regalloc.MapRegister(op.rd), wzr, op1); // (C,rd) = 0 - rs1 - ~rs2(C) + Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C + } break; case shop_rocr: - Ubfx(w0, regalloc.MapRegister(op.rs1), 0, 1); // w0 = rs1[0] (new C) - Mov(regalloc.MapRegister(op.rd), Operand(regalloc.MapRegister(op.rs1), LSR, 1)); // rd = rs1 >> 1 - Bfi(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs2), 31, 1); // rd |= C << 31 - Mov(regalloc.MapRegister(op.rd2), w0); // rd2 = w0 (new C) + { + Register reg1; + Register reg2; + if (op.rs1.is_imm()) + { + 
Mov(w1, op.rs1.imm_value()); + reg1 = w1; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Mov(w2, op.rs2.imm_value()); + reg2 = w2; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Ubfx(w0, reg1, 0, 1); // w0 = rs1[0] (new C) + const Register rd = regalloc.MapRegister(op.rd); + Mov(rd, Operand(reg1, LSR, 1)); // rd = rs1 >> 1 + Bfi(rd, reg2, 31, 1); // rd |= C << 31 + Mov(regalloc.MapRegister(op.rd2), w0); // rd2 = w0 (new C) + } break; case shop_rocl: - Tst(regalloc.MapRegister(op.rs1), 0x80000000); // Z = ~rs1[31] - Orr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs2), Operand(regalloc.MapRegister(op.rs1), LSL, 1)); // rd = rs1 << 1 | rs2(C) - Cset(regalloc.MapRegister(op.rd2), ne); // rd2 = ~Z(C) + { + Register reg1; + Register reg2; + if (op.rs1.is_imm()) + { + Mov(w0, op.rs1.imm_value()); + reg1 = w0; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Mov(w1, op.rs2.imm_value()); + reg2 = w1; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Tst(reg1, 0x80000000); // Z = ~rs1[31] + Orr(regalloc.MapRegister(op.rd), reg2, Operand(reg1, LSL, 1)); // rd = rs1 << 1 | rs2(C) + Cset(regalloc.MapRegister(op.rd2), ne); // rd2 = ~Z(C) + } break; case shop_shld: case shop_shad: { + Register reg1; + if (op.rs1.is_imm()) + { + Mov(w0, op.rs1.imm_value()); + reg1 = w0; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } Label positive_shift, negative_shift, end; - Tbz(regalloc.MapRegister(op.rs2), 31, &positive_shift); - Cmn(regalloc.MapRegister(op.rs2), 32); + const Register rs2 = regalloc.MapRegister(op.rs2); + Tbz(rs2, 31, &positive_shift); + Cmn(rs2, 32); B(&negative_shift, ne); + const Register rd = regalloc.MapRegister(op.rd); + // rs2 == -32 => rd = 0 (logical) or 0/-1 (arith) if (op.op == shop_shld) // Logical shift - Lsr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), 31); + //Lsr(rd, reg1, 31); + Mov(rd, wzr); else // Arithmetic shift - 
Asr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), 31); + Asr(rd, reg1, 31); B(&end); Bind(&positive_shift); - Lsl(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); + // rs2 >= 0 => left shift + Lsl(rd, reg1, rs2); B(&end); Bind(&negative_shift); - Neg(w1, regalloc.MapRegister(op.rs2)); + // rs2 < 0 => right shift + Neg(w1, rs2); if (op.op == shop_shld) // Logical shift - Lsr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w1); + Lsr(rd, reg1, w1); else // Arithmetic shift - Asr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), w1); + Asr(rd, reg1, w1); Bind(&end); } break; @@ -601,19 +727,20 @@ public: case shop_setae: case shop_setab: { + const Register rs1 = regalloc.MapRegister(op.rs1); if (op.op == shop_test) { if (op.rs2.is_imm()) - Tst(regalloc.MapRegister(op.rs1), op.rs2._imm); + Tst(rs1, op.rs2._imm); else - Tst(regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); + Tst(rs1, regalloc.MapRegister(op.rs2)); } else { if (op.rs2.is_imm()) - Cmp(regalloc.MapRegister(op.rs1), op.rs2._imm); + Cmp(rs1, op.rs2._imm); else - Cmp(regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); + Cmp(rs1, regalloc.MapRegister(op.rs2)); } static const Condition shop_conditions[] = { eq, eq, ge, gt, hs, hi }; @@ -622,32 +749,90 @@ public: } break; case shop_setpeq: - Eor(w1, regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); - - Mov(regalloc.MapRegister(op.rd), wzr); - Mov(w2, wzr); // wzr not supported by csinc (?!) 
- Tst(w1, 0xFF000000); - Csinc(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w2, ne); - Tst(w1, 0x00FF0000); - Csinc(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w2, ne); - Tst(w1, 0x0000FF00); - Csinc(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w2, ne); - Tst(w1, 0x000000FF); - Csinc(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w2, ne); + { + Register reg1; + Register reg2; + if (op.rs1.is_imm()) + { + Mov(w0, op.rs1.imm_value()); + reg1 = w0; + } + else + { + reg1 = regalloc.MapRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Mov(w1, op.rs2.imm_value()); + reg2 = w1; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Eor(w1, reg1, reg2); + const Register rd = regalloc.MapRegister(op.rd); + Mov(rd, wzr); + Mov(w2, wzr); // wzr not supported by csinc (?!) + Tst(w1, 0xFF000000); + Csinc(rd, rd, w2, ne); + Tst(w1, 0x00FF0000); + Csinc(rd, rd, w2, ne); + Tst(w1, 0x0000FF00); + Csinc(rd, rd, w2, ne); + Tst(w1, 0x000000FF); + Csinc(rd, rd, w2, ne); + } break; case shop_mul_u16: - Uxth(w10, regalloc.MapRegister(op.rs1)); - Uxth(w11, regalloc.MapRegister(op.rs2)); - Mul(regalloc.MapRegister(op.rd), w10, w11); + { + Register reg2; + if (op.rs2.is_imm()) + { + Mov(w0, op.rs2.imm_value()); + reg2 = w0; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Uxth(w10, regalloc.MapRegister(op.rs1)); + Uxth(w11, reg2); + Mul(regalloc.MapRegister(op.rd), w10, w11); + } break; case shop_mul_s16: - Sxth(w10, regalloc.MapRegister(op.rs1)); - Sxth(w11, regalloc.MapRegister(op.rs2)); - Mul(regalloc.MapRegister(op.rd), w10, w11); + { + Register reg2; + if (op.rs2.is_imm()) + { + Mov(w0, op.rs2.imm_value()); + reg2 = w0; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Sxth(w10, regalloc.MapRegister(op.rs1)); + Sxth(w11, reg2); + Mul(regalloc.MapRegister(op.rd), w10, w11); + } break; case shop_mul_i32: - Mul(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); + { + 
Register reg2; + if (op.rs2.is_imm()) + { + Mov(w0, op.rs2.imm_value()); + reg2 = w0; + } + else + { + reg2 = regalloc.MapRegister(op.rs2); + } + Mul(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), reg2); + } break; case shop_mul_u64: case shop_mul_s64: @@ -709,9 +894,12 @@ public: break; case shop_xtrct: - Lsr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), 16); - Lsl(w0, regalloc.MapRegister(op.rs2), 16); - Orr(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd), w0); + { + const Register rd = regalloc.MapRegister(op.rd); + Lsr(rd, regalloc.MapRegister(op.rs1), 16); + Lsl(w0, regalloc.MapRegister(op.rs2), 16); + Orr(rd, rd, w0); + } break; // @@ -719,16 +907,104 @@ public: // case shop_fadd: - Fadd(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2)); + { + VRegister reg1; + VRegister reg2; + if (op.rs1.is_imm()) + { + Fmov(s0, reinterpret_cast(op.rs1._imm)); + reg1 = s0; + } + else + { + reg1 = regalloc.MapVRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Fmov(s1, reinterpret_cast(op.rs2._imm)); + reg2 = s1; + } + else + { + reg2 = regalloc.MapVRegister(op.rs2); + } + Fadd(regalloc.MapVRegister(op.rd), reg1, reg2); + } break; case shop_fsub: - Fsub(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2)); + { + VRegister reg1; + VRegister reg2; + if (op.rs1.is_imm()) + { + Fmov(s0, reinterpret_cast(op.rs1._imm)); + reg1 = s0; + } + else + { + reg1 = regalloc.MapVRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Fmov(s1, reinterpret_cast(op.rs2._imm)); + reg2 = s1; + } + else + { + reg2 = regalloc.MapVRegister(op.rs2); + } + Fsub(regalloc.MapVRegister(op.rd), reg1, reg2); + } break; case shop_fmul: - Fmul(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2)); + { + VRegister reg1; + VRegister reg2; + if (op.rs1.is_imm()) + { + Fmov(s0, reinterpret_cast(op.rs1._imm)); + reg1 = s0; + } + else + { + reg1 = 
regalloc.MapVRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Fmov(s1, reinterpret_cast(op.rs2._imm)); + reg2 = s1; + } + else + { + reg2 = regalloc.MapVRegister(op.rs2); + } + Fmul(regalloc.MapVRegister(op.rd), reg1, reg2); + } break; case shop_fdiv: - Fdiv(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1), regalloc.MapVRegister(op.rs2)); + { + VRegister reg1; + VRegister reg2; + if (op.rs1.is_imm()) + { + Fmov(s0, reinterpret_cast(op.rs1._imm)); + reg1 = s0; + } + else + { + reg1 = regalloc.MapVRegister(op.rs1); + } + if (op.rs2.is_imm()) + { + Fmov(s1, reinterpret_cast(op.rs2._imm)); + reg2 = s1; + } + else + { + reg2 = regalloc.MapVRegister(op.rs2); + } + Fdiv(regalloc.MapVRegister(op.rd), reg1, reg2); + } break; case shop_fabs: @@ -888,12 +1164,12 @@ public: break; case CPT_f32: - if (prm.is_reg()) { + if (prm.is_reg()) Fmov(*call_fregs[fregused], regalloc.MapVRegister(prm)); - } - else { + else if (prm.is_imm()) + Fmov(*call_fregs[fregused], reinterpret_cast(prm._imm)); + else verify(prm.is_null()); - } fregused++; break; @@ -1033,7 +1309,7 @@ public: if (block->oplist[opid].op == shop_readm) { regalloc.DoAlloc(block); - regalloc.current_opid = opid; + regalloc.SetOpnum(opid); } } @@ -1181,9 +1457,8 @@ public: vmem_platform_flush_cache( CC_RW2RX(GetBuffer()->GetStartAddress()), CC_RW2RX(GetBuffer()->GetEndAddress()), GetBuffer()->GetStartAddress(), GetBuffer()->GetEndAddress()); - #if 0 -// if (rewrite) + if (rewrite && block != NULL) { Instruction* instr_start = (Instruction*)block->code; // Instruction* instr_end = GetLabelAddress(&code_end); @@ -1432,6 +1707,10 @@ private: { switch (size) { + case 1: + Ldrsb(regalloc.MapRegister(op.rd), MemOperand(x1)); + break; + case 2: Ldrsh(regalloc.MapRegister(op.rd), MemOperand(x1)); break; @@ -1452,6 +1731,10 @@ private: { switch (size) { + case 1: + Ldrsb(w1, MemOperand(x1)); + break; + case 2: Ldrsh(w1, MemOperand(x1)); break; @@ -1460,11 +1743,18 @@ private: Ldr(w1, MemOperand(x1)); break; + case 8: + 
Ldr(x1, MemOperand(x1)); + break; + default: die("Invalid size"); break; } - Str(w1, sh4_context_mem_operand(op.rd.reg_ptr())); + if (size == 8) + Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); + else + Str(w1, sh4_context_mem_operand(op.rd.reg_ptr())); } } else @@ -1568,7 +1858,7 @@ private: Lsr(x1, x1, 32); Fmov(regalloc.MapVRegister(op.rd, 1), w1); #else - Str(x1, sh4_context_mem_operand(op.rd.reg_ptr())); + die("GenReadMemoryFast: size == 8 and !explode_spans"); #endif } } @@ -1604,6 +1894,9 @@ private: void GenWriteMemory(const shil_opcode& op, size_t opid, bool optimise) { + if (GenWriteMemoryImmediate(op)) + return; + GenMemAddr(op, call_regs[0]); u32 size = op.flags & 0x7f; @@ -1627,6 +1920,111 @@ private: GenWriteMemorySlow(op); } + bool GenWriteMemoryImmediate(const shil_opcode& op) + { + if (!op.rs1.is_imm()) + return false; + + u32 size = op.flags & 0x7f; + u32 addr = op.rs1._imm; + if (mmu_enabled()) + { + if ((addr >> 12) != (block->vaddr >> 12)) + // When full mmu is on, only consider addresses in the same 4k page + return false; + u32 paddr; + u32 rv; + switch (size) + { + case 1: + rv = mmu_data_translation(addr, paddr); + break; + case 2: + rv = mmu_data_translation(addr, paddr); + break; + case 4: + case 8: + rv = mmu_data_translation(addr, paddr); + break; + } + if (rv != MMU_ERROR_NONE) + return false; + addr = paddr; + } + bool isram = false; + void* ptr = _vmem_write_const(addr, isram, size); + + Register reg2; + if (op.rs2.is_imm()) + { + Mov(w0, op.rs2._imm); + reg2 = w0; + } + else if (regalloc.IsAllocg(op.rs2)) + { + reg2 = regalloc.MapRegister(op.rs2); + } + else if (regalloc.IsAllocf(op.rs2)) + { + Fmov(w0, regalloc.MapVRegister(op.rs2)); + reg2 = w0; + } + else + die("Invalid rs2 param"); + if (isram) + { + Ldr(x1, reinterpret_cast(ptr)); + switch (size) + { + case 1: + Strb(reg2, MemOperand(x1)); + break; + + case 2: + Strh(reg2, MemOperand(x1)); + break; + + case 4: + if (op.rs2.is_r32f()) + Str(reg2, MemOperand(x1)); + else + 
Str(reg2, MemOperand(x1)); + break; + + default: + die("Invalid size"); + break; + } + } + else + { + // Not RAM + Mov(w1, reg2); + Mov(w0, addr); + + switch(size) + { + case 1: + GenCallRuntime((void (*)())ptr); + break; + + case 2: + GenCallRuntime((void (*)())ptr); + break; + + case 4: + GenCallRuntime((void (*)())ptr); + break; + + case 8: + die("SZ_64F not supported"); + break; + } + } + + return true; + } + bool GenWriteMemoryFast(const shil_opcode& op, size_t opid) { // Direct memory access. Need to handle SIGSEGV and rewrite block as needed. See ngen_Rewrite() diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index 4c7db5854..ed73b74d3 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -3,8 +3,9 @@ #if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X64 #include -#define EXPLODE_SPANS +//#define EXPLODE_SPANS //#define PROFILING +//#define CANONICAL_TEST #include "deps/xbyak/xbyak.h" #include "deps/xbyak/xbyak_util.h" @@ -40,23 +41,32 @@ extern "C" { int cycle_counter; } -double host_cpu_time; -u64 guest_cpu_cycles; +u64 host_cpu_time; u32 mem_writes, mem_reads; u32 mem_rewrites_w, mem_rewrites_r; #ifdef PROFILING -static double slice_start; +static clock_t slice_start; +int start_cycle; extern "C" { -static __attribute((used)) void start_slice() +static __attribute((used)) void* start_slice(void *p) { - slice_start = os_GetSeconds(); + slice_start = clock(); + start_cycle = cycle_counter; + return p; } static __attribute((used)) void end_slice() { - host_cpu_time += os_GetSeconds() - slice_start; + clock_t now = clock(); + if (slice_start != 0) + { + host_cpu_time += now - slice_start; + guest_cpu_cycles += start_cycle - cycle_counter; + } + slice_start = now; + start_cycle = cycle_counter; } } #endif @@ -158,15 +168,15 @@ WIN32_ONLY( ".seh_pushreg %r14 \n\t") #endif "call " _U "bm_GetCodeByVAddr \n\t" "call *%rax \n\t" +#ifdef PROFILING + "call end_slice \n\t" +#endif "movl " _U "cycle_counter(%rip), %ecx \n\t" "testl 
%ecx, %ecx \n\t" "jg 2b \n\t" // slice_loop "addl $" _S(SH4_TIMESLICE) ", %ecx \n\t" "movl %ecx, " _U "cycle_counter(%rip) \n\t" -#ifdef PROFILING - "call end_slice \n\t" -#endif "call " _U "UpdateSystem_INTC \n\t" "jmp 1b \n" // run_loop @@ -371,11 +381,6 @@ public: sub(dword[rax], block->guest_cycles); #else sub(dword[rip + &cycle_counter], block->guest_cycles); -#endif -#ifdef PROFILING - mov(rax, (uintptr_t)&guest_cpu_cycles); - mov(ecx, block->guest_cycles); - add(qword[rax], rcx); #endif regalloc.DoAlloc(block); @@ -412,14 +417,12 @@ public: case shop_jcond: case shop_jdyn: { + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + Xbyak::Reg32 rs1 = regalloc.MapRegister(op.rs1); + if (rd != rs1) + mov(rd, rs1); if (op.rs2.is_imm()) - { - mov(ecx, regalloc.MapRegister(op.rs1)); - add(ecx, op.rs2._imm); - mov(regalloc.MapRegister(op.rd), ecx); - } - else - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); + add(rd, op.rs2._imm); } break; @@ -495,41 +498,44 @@ public: case shop_writem: { - shil_param_to_host_reg(op.rs1, call_regs[0]); - if (!op.rs3.is_null()) + if (!GenWriteMemImmediate(op, block)) { - if (op.rs3.is_imm()) - add(call_regs[0], op.rs3._imm); - else if (regalloc.IsAllocg(op.rs3)) - add(call_regs[0], regalloc.MapRegister(op.rs3)); - else + shil_param_to_host_reg(op.rs1, call_regs[0]); + if (!op.rs3.is_null()) { - mov(rax, (uintptr_t)op.rs3.reg_ptr()); - add(call_regs[0], dword[rax]); + if (op.rs3.is_imm()) + add(call_regs[0], op.rs3._imm); + else if (regalloc.IsAllocg(op.rs3)) + add(call_regs[0], regalloc.MapRegister(op.rs3)); + else + { + mov(rax, (uintptr_t)op.rs3.reg_ptr()); + add(call_regs[0], dword[rax]); + } } - } - u32 size = op.flags & 0x7f; - if (size != 8) - shil_param_to_host_reg(op.rs2, call_regs[1]); - else { + u32 size = op.flags & 0x7f; + if (size != 8) + shil_param_to_host_reg(op.rs2, call_regs[1]); + else { #ifdef EXPLODE_SPANS - if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) - { - 
movd(call_regs[1], regalloc.MapXRegister(op.rs2, 1)); - shl(call_regs64[1], 32); - movd(eax, regalloc.MapXRegister(op.rs2, 0)); - or_(call_regs64[1], rax); - } - else + if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) + { + movd(call_regs[1], regalloc.MapXRegister(op.rs2, 1)); + shl(call_regs64[1], 32); + movd(eax, regalloc.MapXRegister(op.rs2, 0)); + or_(call_regs64[1], rax); + } + else #endif - { - mov(rax, (uintptr_t)op.rs2.reg_ptr()); - mov(call_regs64[1], qword[rax]); + { + mov(rax, (uintptr_t)op.rs2.reg_ptr()); + mov(call_regs64[1], qword[rax]); + } } + if (!optimise || !GenWriteMemoryFast(op, block)) + GenWriteMemorySlow(op, block); } - if (!optimise || !GenWriteMemoryFast(op, block)) - GenWriteMemorySlow(op, block); } break; @@ -544,7 +550,7 @@ public: case shop_swaplb: if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - ror(Xbyak::Reg16(regalloc.MapRegister(op.rd).getIdx()), 8); + ror(regalloc.MapRegister(op.rd).cvt16(), 8); break; case shop_neg: @@ -595,12 +601,30 @@ public: break; case shop_adc: - if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - cmp(regalloc.MapRegister(op.rs3), 1); // C = ~rs3 - cmc(); // C = rs3 - adc(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs2)); // (C,rd)=rs1+rs2+rs3(C) - setc(regalloc.MapRegister(op.rd2).cvt8()); // rd2 = C + { + cmp(regalloc.MapRegister(op.rs3), 1); // C = ~rs3 + Xbyak::Reg32 rs2; + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + if (op.rs2.is_reg()) + { + rs2 = regalloc.MapRegister(op.rs2); + if (regalloc.mapg(op.rd) == regalloc.mapg(op.rs2)) + { + mov(ecx, rs2); + rs2 = ecx; + } + } + if (op.rs1.is_imm()) + mov(rd, op.rs1.imm_value()); + else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) + mov(rd, regalloc.MapRegister(op.rs1)); + cmc(); // C = rs3 + if (op.rs2.is_reg()) + adc(rd, rs2); // (C,rd)=rs1+rs2+rs3(C) + else + 
adc(rd, op.rs2.imm_value()); + setc(regalloc.MapRegister(op.rd2).cvt8()); // rd2 = C + } break; /* FIXME buggy @@ -619,11 +643,27 @@ public: */ case shop_negc: { - if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - Xbyak::Reg64 rd64 = regalloc.MapRegister(op.rd).cvt64(); + Xbyak::Reg32 rs2; + if (op.rs2.is_reg()) + { + rs2 = regalloc.MapRegister(op.rs2); + if (regalloc.mapg(op.rd) == regalloc.mapg(op.rs2)) + { + mov(ecx, rs2); + rs2 = ecx; + } + } + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + if (op.rs1.is_imm()) + mov(rd, op.rs1.imm_value()); + else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) + mov(rd, regalloc.MapRegister(op.rs1)); + Xbyak::Reg64 rd64 = rd.cvt64(); neg(rd64); - sub(rd64, regalloc.MapRegister(op.rs2).cvt64()); + if (op.rs2.is_imm()) + sub(rd64, op.rs2.imm_value()); + else + sub(rd64, rs2.cvt64()); Xbyak::Reg64 rd2_64 = regalloc.MapRegister(op.rd2).cvt64(); mov(rd2_64, rd64); shr(rd2_64, 63); @@ -632,48 +672,60 @@ public: case shop_rocr: case shop_rocl: - if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - cmp(regalloc.MapRegister(op.rs2), 1); // C = ~rs2 - cmc(); // C = rs2 - if (op.op == shop_rocr) - rcr(regalloc.MapRegister(op.rd), 1); - else - rcl(regalloc.MapRegister(op.rd), 1); - setc(al); - movzx(regalloc.MapRegister(op.rd2), al); // rd2 = C + { + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + cmp(regalloc.MapRegister(op.rs2), 1); // C = ~rs2 + if (op.rs1.is_imm()) + mov(rd, op.rs1.imm_value()); + else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) + mov(rd, regalloc.MapRegister(op.rs1)); + cmc(); // C = rs2 + if (op.op == shop_rocr) + rcr(rd, 1); + else + rcl(rd, 1); + setc(al); + movzx(regalloc.MapRegister(op.rd2), al); // rd2 = C + } break; case shop_shld: case shop_shad: { - if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), 
regalloc.MapRegister(op.rs1)); + if (op.rs2.is_reg()) + mov(ecx, regalloc.MapRegister(op.rs2)); + else + // This shouldn't happen. If arg is imm -> shop_shl/shr/sar + mov(ecx, op.rs2.imm_value()); + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + if (op.rs1.is_imm()) + mov(rd, op.rs1.imm_value()); + else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) + mov(rd, regalloc.MapRegister(op.rs1)); Xbyak::Label negative_shift; Xbyak::Label non_zero; Xbyak::Label exit; - mov(ecx, regalloc.MapRegister(op.rs2)); cmp(ecx, 0); js(negative_shift); - shl(regalloc.MapRegister(op.rd), cl); + shl(rd, cl); jmp(exit); L(negative_shift); test(ecx, 0x1f); jnz(non_zero); if (op.op == shop_shld) - xor_(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rd)); + xor_(rd, rd); else - sar(regalloc.MapRegister(op.rd), 31); + sar(rd, 31); jmp(exit); L(non_zero); neg(ecx); if (op.op == shop_shld) - shr(regalloc.MapRegister(op.rd), cl); + shr(rd, cl); else - sar(regalloc.MapRegister(op.rd), cl); + sar(rd, cl); L(exit); } break; @@ -730,25 +782,40 @@ public: break; */ case shop_mul_u16: - movzx(eax, Xbyak::Reg16(regalloc.MapRegister(op.rs1).getIdx())); - movzx(ecx, Xbyak::Reg16(regalloc.MapRegister(op.rs2).getIdx())); + movzx(eax, regalloc.MapRegister(op.rs1).cvt16()); + if (op.rs2.is_reg()) + movzx(ecx, regalloc.MapRegister(op.rs2).cvt16()); + else + mov(ecx, op.rs2._imm & 0xFFFF); mul(ecx); mov(regalloc.MapRegister(op.rd), eax); break; case shop_mul_s16: - movsx(eax, Xbyak::Reg16(regalloc.MapRegister(op.rs1).getIdx())); - movsx(ecx, Xbyak::Reg16(regalloc.MapRegister(op.rs2).getIdx())); + movsx(eax, regalloc.MapRegister(op.rs1).cvt16()); + if (op.rs2.is_reg()) + movsx(ecx, regalloc.MapRegister(op.rs2).cvt16()); + else + mov(ecx, (s32)(s16)op.rs2._imm); mul(ecx); mov(regalloc.MapRegister(op.rd), eax); break; case shop_mul_i32: mov(eax, regalloc.MapRegister(op.rs1)); - mul(regalloc.MapRegister(op.rs2)); + if (op.rs2.is_reg()) + mul(regalloc.MapRegister(op.rs2)); + else + { + mov(ecx, 
op.rs2._imm); + mul(ecx); + } mov(regalloc.MapRegister(op.rd), eax); break; case shop_mul_u64: mov(eax, regalloc.MapRegister(op.rs1)); - mov(ecx, regalloc.MapRegister(op.rs2)); + if (op.rs2.is_reg()) + mov(ecx, regalloc.MapRegister(op.rs2)); + else + mov(ecx, op.rs2._imm); mul(rcx); mov(regalloc.MapRegister(op.rd), eax); shr(rax, 32); @@ -756,7 +823,10 @@ public: break; case shop_mul_s64: movsxd(rax, regalloc.MapRegister(op.rs1)); - movsxd(rcx, regalloc.MapRegister(op.rs2)); + if (op.rs2.is_reg()) + movsxd(rcx, regalloc.MapRegister(op.rs2)); + else + mov(rcx, (s64)(s32)op.rs2._imm); mul(rcx); mov(regalloc.MapRegister(op.rd), eax); shr(rax, 32); @@ -764,6 +834,33 @@ public: break; case shop_pref: + if (op.rs1.is_imm()) + { + // this test shouldn't be necessary + if ((op.rs1._imm & 0xFC000000) == 0xE0000000) + { + mov(call_regs[0], op.rs1._imm); + if (mmu_enabled()) + { + mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc + + GenCall(do_sqw_mmu_no_ex); + } + else + { + if (CCN_MMUCR.AT == 1) + { + GenCall(do_sqw_mmu); + } + else + { + mov(call_regs64[1], (uintptr_t)sq_both); + GenCall(&do_sqw_nommu_local); + } + } + } + } + else { Xbyak::Reg32 rn; if (regalloc.IsAllocg(op.rs1)) @@ -810,16 +907,31 @@ public: movsx(regalloc.MapRegister(op.rd), al); break; case shop_ext_s16: - movsx(regalloc.MapRegister(op.rd), Xbyak::Reg16(regalloc.MapRegister(op.rs1).getIdx())); + movsx(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1).cvt16()); break; case shop_xtrct: - if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); - shr(regalloc.MapRegister(op.rd), 16); - mov(eax, regalloc.MapRegister(op.rs2)); - shl(eax, 16); - or_(regalloc.MapRegister(op.rd), eax); + { + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + Xbyak::Reg32 rs1 = regalloc.MapRegister(op.rs1); + Xbyak::Reg32 rs2 = regalloc.MapRegister(op.rs2); + if (regalloc.mapg(op.rd) == regalloc.mapg(op.rs2)) + { + shl(rd, 16); + 
mov(eax, rs1); + shr(eax, 16); + or_(rd, eax); + break; + } + else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) + { + mov(rd, rs1); + } + shr(rd, 16); + mov(eax, rs2); + shl(eax, 16); + or_(rd, eax); + } break; // @@ -859,15 +971,38 @@ public: break; case shop_fmac: - if (regalloc.mapf(op.rd) != regalloc.mapf(op.rs1)) - movss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs1)); - if (cpu.has(Xbyak::util::Cpu::tFMA)) - vfmadd231ss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs2), regalloc.MapXRegister(op.rs3)); - else { - movss(xmm0, regalloc.MapXRegister(op.rs2)); - mulss(xmm0, regalloc.MapXRegister(op.rs3)); - addss(regalloc.MapXRegister(op.rd), xmm0); + Xbyak::Xmm rs1 = regalloc.MapXRegister(op.rs1); + Xbyak::Xmm rs2 = regalloc.MapXRegister(op.rs2); + Xbyak::Xmm rs3 = regalloc.MapXRegister(op.rs3); + Xbyak::Xmm rd = regalloc.MapXRegister(op.rd); + if (rd == rs2) + { + movss(xmm1, rs2); + rs2 = xmm1; + } + if (rd == rs3) + { + movss(xmm2, rs3); + rs3 = xmm2; + } + if (op.rs1.is_imm()) + { + mov(eax, op.rs1._imm); + movd(rd, eax); + } + else if (rd != rs1) + { + movss(rd, rs1); + } + if (cpu.has(Xbyak::util::Cpu::tFMA)) + vfmadd231ss(rd, rs2, rs3); + else + { + movss(xmm0, rs2); + mulss(xmm0, rs3); + addss(rd, xmm0); + } } break; @@ -898,7 +1033,7 @@ public: break; case shop_fsca: - movzx(rax, Xbyak::Reg16(regalloc.MapRegister(op.rs1).getIdx())); + movzx(rax, regalloc.MapRegister(op.rs1).cvt16()); mov(rcx, (uintptr_t)&sin_table); #ifdef EXPLODE_SPANS movss(regalloc.MapXRegister(op.rd, 0), dword[rcx + rax * 8]); @@ -1359,6 +1494,17 @@ private: mov(rax, reinterpret_cast(ptr)); switch (size) { + case 1: + if (regalloc.IsAllocg(op.rd)) + movsx(regalloc.MapRegister(op.rd), byte[rax]); + else + { + movsx(eax, byte[rax]); + mov(rcx, (uintptr_t)op.rd.reg_ptr()); + mov(dword[rcx], eax); + } + break; + case 2: if (regalloc.IsAllocg(op.rd)) movsx(regalloc.MapRegister(op.rd), word[rax]); @@ -1383,6 +1529,23 @@ private: } break; + case 8: + mov(rcx, 
qword[rax]); +#ifdef EXPLODE_SPANS + if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)) + { + movd(regalloc.MapXRegister(op.rd, 0), ecx); + shr(rcx, 32); + movd(regalloc.MapXRegister(op.rd, 1), ecx); + } + else +#endif + { + mov(rax, (uintptr_t)op.rd.reg_ptr()); + mov(qword[rax], rcx); + } + break; + + default: + die("Invalid immediate size"); + break; + } @@ -1395,6 +1558,11 @@ private: switch(size) { + case 1: + GenCall((void (*)())ptr); + movsx(ecx, al); + break; + case 2: GenCall((void (*)())ptr); movsx(ecx, ax); @@ -1415,6 +1583,122 @@ private: return true; } + bool GenWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block) + { + if (!op.rs1.is_imm()) + return false; + u32 size = op.flags & 0x7f; + u32 addr = op.rs1._imm; + if (mmu_enabled()) + { + if ((addr >> 12) != (block->vaddr >> 12)) + // When full mmu is on, only consider addresses in the same 4k page + return false; + + u32 paddr; + u32 rv; + switch (size) + { + case 1: + rv = mmu_data_translation<MMU_TT_DWRITE, u8>(addr, paddr); + break; + case 2: + rv = mmu_data_translation<MMU_TT_DWRITE, u16>(addr, paddr); + break; + case 4: + case 8: + rv = mmu_data_translation<MMU_TT_DWRITE, u32>(addr, paddr); + break; + } + if (rv != MMU_ERROR_NONE) + return false; + + addr = paddr; + } + bool isram = false; + void* ptr = _vmem_write_const(addr, isram, size); + + if (isram) + { + // Immediate pointer to RAM: super-duper fast access + mov(rax, reinterpret_cast<uintptr_t>(ptr)); + switch (size) + { + case 1: + if (regalloc.IsAllocg(op.rs2)) + mov(byte[rax], regalloc.MapRegister(op.rs2).cvt8()); + else if (op.rs2.is_imm()) + mov(byte[rax], op.rs2._imm); + else + { + mov(rcx, (uintptr_t)op.rs2.reg_ptr()); + mov(ecx, dword[rcx]); + mov(byte[rax], cl); + } + break; + + case 2: + if (regalloc.IsAllocg(op.rs2)) + mov(word[rax], regalloc.MapRegister(op.rs2).cvt16()); + else if (op.rs2.is_imm()) + mov(word[rax], op.rs2._imm); + else + { + mov(rcx, (uintptr_t)op.rs2.reg_ptr()); + mov(ecx, dword[rcx]); + mov(word[rax], cx); + } + break; + + case 4: + if 
(regalloc.IsAllocg(op.rs2)) + mov(dword[rax], regalloc.MapRegister(op.rs2)); + else if (regalloc.IsAllocf(op.rs2)) + movd(dword[rax], regalloc.MapXRegister(op.rs2)); + else if (op.rs2.is_imm()) + mov(dword[rax], op.rs2._imm); + else + { + mov(rcx, (uintptr_t)op.rs2.reg_ptr()); + mov(ecx, dword[rcx]); + mov(dword[rax], ecx); + } + break; + + case 8: +#ifdef EXPLODE_SPANS + if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) + { + movd(call_regs[1], regalloc.MapXRegister(op.rs2, 1)); + shl(call_regs64[1], 32); + movd(eax, regalloc.MapXRegister(op.rs2, 0)); + or_(call_regs64[1], rax); + } + else +#endif + { + mov(rcx, (uintptr_t)op.rs2.reg_ptr()); + mov(rcx, qword[rcx]); + mov(qword[rax], rcx); + } + break; + + default: + die("Invalid immediate size"); + break; + } + } + else + { + // Not RAM: the returned pointer is a memory handler + mov(call_regs[0], addr); + shil_param_to_host_reg(op.rs2, call_regs[1]); + + GenCall((void (*)())ptr); + } + + return true; + } + bool GenReadMemoryFast(const shil_opcode& op, RuntimeBlockInfo* block) { if (!mmu_enabled() || !vmem32_enabled()) @@ -1487,11 +1771,11 @@ private: switch (size) { case 1: - mov(byte[rax + call_regs64[0] + 0], Xbyak::Reg8(call_regs[1].getIdx(), call_regs[1] == edi || call_regs[1] == esi)); + mov(byte[rax + call_regs64[0] + 0], call_regs[1].cvt8()); break; case 2: - mov(word[rax + call_regs64[0]], Xbyak::Reg16(call_regs[1].getIdx())); + mov(word[rax + call_regs64[0]], call_regs[1].cvt16()); break; case 4: @@ -1589,22 +1873,67 @@ private: void GenBinaryOp(const shil_opcode &op, X64BinaryOp natop) { + Xbyak::Reg32 rd = regalloc.MapRegister(op.rd); + const shil_param *rs2 = &op.rs2; if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1)) - mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1)); + { + if (op.rs2.is_reg() && regalloc.mapg(op.rd) == regalloc.mapg(op.rs2)) + { + if (op.op == shop_sub) + { + // This op isn't commutative + mov(ecx, regalloc.MapRegister(op.rs2)); + mov(rd, 
regalloc.MapRegister(op.rs1)); + (this->*natop)(rd, ecx); + + return; + } + // otherwise just swap the operands + rs2 = &op.rs1; + } + else + mov(rd, regalloc.MapRegister(op.rs1)); + } if (op.rs2.is_imm()) { mov(ecx, op.rs2._imm); - (this->*natop)(regalloc.MapRegister(op.rd), ecx); + (this->*natop)(rd, ecx); } else - (this->*natop)(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs2)); + (this->*natop)(rd, regalloc.MapRegister(*rs2)); } void GenBinaryFOp(const shil_opcode &op, X64BinaryFOp natop) { + Xbyak::Xmm rd = regalloc.MapXRegister(op.rd); + const shil_param *rs2 = &op.rs2; if (regalloc.mapf(op.rd) != regalloc.mapf(op.rs1)) - movss(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs1)); - (this->*natop)(regalloc.MapXRegister(op.rd), regalloc.MapXRegister(op.rs2)); + { + if (op.rs2.is_reg() && regalloc.mapf(op.rd) == regalloc.mapf(op.rs2)) + { + if (op.op == shop_fsub || op.op == shop_fdiv) + { + // these ops aren't commutative so we need a scratch reg + movss(xmm0, regalloc.MapXRegister(op.rs2)); + movss(rd, regalloc.MapXRegister(op.rs1)); + (this->*natop)(rd, xmm0); + + return; + } + // otherwise just swap the operands + rs2 = &op.rs1; + } + else + movss(rd, regalloc.MapXRegister(op.rs1)); + } + if (op.rs2.is_imm()) + { + mov(eax, op.rs2._imm); + movd(xmm0, eax); + (this->*natop)(rd, xmm0); + } + else + (this->*natop)(rd, regalloc.MapXRegister(*rs2)); } template @@ -1693,10 +2022,11 @@ private: { if (regalloc.IsAllocf(param)) { + Xbyak::Xmm sreg = regalloc.MapXRegister(param); if (!reg.isXMM()) - movd((const Xbyak::Reg32 &)reg, regalloc.MapXRegister(param)); - else - movss((const Xbyak::Xmm &)reg, regalloc.MapXRegister(param)); + movd((const Xbyak::Reg32 &)reg, sreg); + else if (reg != sreg) + movss((const Xbyak::Xmm &)reg, sreg); } else { @@ -1709,10 +2039,11 @@ private: { if (regalloc.IsAllocg(param)) { - if (!reg.isXMM()) - mov((const Xbyak::Reg32 &)reg, regalloc.MapRegister(param)); - else - movd((const Xbyak::Xmm &)reg, 
regalloc.MapRegister(param)); + Xbyak::Reg32 sreg = regalloc.MapRegister(param); + if (reg.isXMM()) + movd((const Xbyak::Xmm &)reg, sreg); + else if (reg != sreg) + mov((const Xbyak::Reg32 &)reg, sreg); } else { @@ -1735,17 +2066,19 @@ private: { if (regalloc.IsAllocg(param)) { + Xbyak::Reg32 sreg = regalloc.MapRegister(param); if (!reg.isXMM()) - mov(regalloc.MapRegister(param), (const Xbyak::Reg32 &)reg); - else - movd(regalloc.MapRegister(param), (const Xbyak::Xmm &)reg); + mov(sreg, (const Xbyak::Reg32 &)reg); + else if (reg != sreg) + movd(sreg, (const Xbyak::Xmm &)reg); } else if (regalloc.IsAllocf(param)) { + Xbyak::Xmm sreg = regalloc.MapXRegister(param); if (!reg.isXMM()) - movd(regalloc.MapXRegister(param), (const Xbyak::Reg32 &)reg); - else - movss(regalloc.MapXRegister(param), (const Xbyak::Xmm &)reg); + movd(sreg, (const Xbyak::Reg32 &)reg); + else if (reg != sreg) + movss(sreg, (const Xbyak::Xmm &)reg); } else { diff --git a/core/rec-x64/x64_regalloc.h b/core/rec-x64/x64_regalloc.h index 7614ba1f6..fe0de219d 100644 --- a/core/rec-x64/x64_regalloc.h +++ b/core/rec-x64/x64_regalloc.h @@ -20,8 +20,14 @@ #ifndef CORE_REC_X64_X64_REGALLOC_H_ #define CORE_REC_X64_X64_REGALLOC_H_ +//#define OLD_REGALLOC + #include "deps/xbyak/xbyak.h" +#ifdef OLD_REGALLOC #include "hw/sh4/dyna/regalloc.h" +#else +#include "hw/sh4/dyna/ssa_regalloc.h" +#endif #ifdef _WIN32 static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI, @@ -65,7 +71,11 @@ struct X64RegAlloc : RegAllocnregf == xmm.getIdx() && all_spans[sid]->contains(opid)) return true; } return false; +#endif } BlockCompiler *compiler;