dynarec: reg alloc 64-bit regs. avoid some interpreter fallbacks

Option to reg alloc 64-bit regs in two host regs. Used when FPSCR.SZ == 1
(64-bit register and memory transfers). Enabled for the arm, arm64 and x64
(Windows only) dynarecs.
Don't fall back to the interpreter when FPSCR.PR == 1 (double precision) for
FMOV, FLDS and FSTS.
Flyinghead 2022-12-23 16:06:54 +01:00
parent 95a00a165a
commit 62085539a7
16 changed files with 474 additions and 298 deletions
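In short: with this change a guest double-precision pair (DRn/XDn) no longer has to be spilled through the Sh4 context on every access. When a backend opts in, the allocator hands out two 32-bit host FP registers and each half is fetched with mapf(param, index). A minimal sketch of the gating idea, assuming a simplified stand-in for the RegAlloc template (VecGate and its members below are illustrative, not project code):

// Sketch only: with the new AllocVec2 template flag, operands spanning two
// 32-bit guest registers (a double, count() == 2) stay register-allocated,
// while wider vector operands still bypass the allocator and are flushed.
#include <cstdint>

template <bool AllocVec2>
struct VecGate            // hypothetical stand-in for RegAlloc's new behavior
{
    static constexpr uint32_t MaxVecSize = AllocVec2 ? 2 : 1;
    static bool allocatable(uint32_t count) { return count <= MaxVecSize; }
};

int main()
{
    // A DRn pair (count 2) is allocatable only when AllocVec2 is enabled;
    // an FVn quad (count 4) is never register-allocated by this scheme.
    bool ok = VecGate<true>::allocatable(2)
           && !VecGate<false>::allocatable(2)
           && !VecGate<true>::allocatable(4);
    return ok ? 0 : 1;
}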

View File

@ -48,7 +48,7 @@ static const char idle_hash[] =
static inline shil_param mk_imm(u32 immv)
{
return shil_param(FMT_IMM,immv);
return shil_param(immv);
}
static inline shil_param mk_reg(Sh4RegType reg)
@ -63,17 +63,18 @@ static inline shil_param mk_regi(int reg)
static state_t state;
static void Emit(shilop op,shil_param rd=shil_param(),shil_param rs1=shil_param(),shil_param rs2=shil_param(),u32 flags=0,shil_param rs3=shil_param(),shil_param rd2=shil_param())
static void Emit(shilop op, shil_param rd = shil_param(), shil_param rs1 = shil_param(), shil_param rs2 = shil_param(),
u32 size = 0, shil_param rs3 = shil_param(), shil_param rd2 = shil_param())
{
shil_opcode sp;
sp.flags=flags;
sp.op=op;
sp.rd=(rd);
sp.rd2=(rd2);
sp.rs1=(rs1);
sp.rs2=(rs2);
sp.rs3=(rs3);
sp.size = size;
sp.op = op;
sp.rd = rd;
sp.rd2 = rd2;
sp.rs1 = rs1;
sp.rs2 = rs2;
sp.rs3 = rs3;
sp.guest_offs = state.cpu.rpc - blk->vaddr;
sp.delay_slot = state.cpu.is_delayslot;
@ -83,12 +84,12 @@ static void Emit(shilop op,shil_param rd=shil_param(),shil_param rs1=shil_param(
static void dec_fallback(u32 op)
{
shil_opcode opcd;
opcd.op=shop_ifb;
opcd.op = shop_ifb;
opcd.rs1=shil_param(FMT_IMM,OpDesc[op]->NeedPC());
opcd.rs1 = shil_param(OpDesc[op]->NeedPC());
opcd.rs2=shil_param(FMT_IMM,state.cpu.rpc+2);
opcd.rs3=shil_param(FMT_IMM,op);
opcd.rs2 = shil_param(state.cpu.rpc + 2);
opcd.rs3 = shil_param(op);
opcd.guest_offs = state.cpu.rpc - blk->vaddr;
opcd.delay_slot = state.cpu.is_delayslot;
@ -671,9 +672,13 @@ static bool dec_generic(u32 op)
if (op>=0xF000)
{
state.info.has_fpu=true;
if (state.cpu.FPR64)
if (state.cpu.FPR64) {
// fallback to interpreter for double float ops
return false;
// except fmov, flds and fsts that don't depend on PR
if (((op & 0xf) < 6 || (op & 0xf) > 0xc) // fmov
&& (op & 0xef) != 0x0d) // flds, fsts
return false;
}
if (state.cpu.FSZ64 && (d==PRM_FRN_SZ || d==PRM_FRM_SZ || s==PRM_FRN_SZ || s==PRM_FRM_SZ))
transfer_64 = true;
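For context, the two masks above match the SH4 encodings: every FMOV variant in the 0xFnnn group uses a low nibble of 0x6 to 0xC, while FLDS FRm,FPUL is 0xFn1D and FSTS FPUL,FRn is 0xFn0D, so (op & 0xef) == 0x0d catches both. A standalone restatement of the predicate, as a sketch (not project code):

// Illustrative only: returns true for the FPU opcodes that keep going through
// the dynarec when FPSCR.PR == 1, mirroring the test in dec_generic above.
#include <cstdint>

static bool staysInDynarec(uint16_t op)        // op is a 0xFxxx FPU opcode
{
    bool isFmov = (op & 0xf) >= 0x6 && (op & 0xf) <= 0xc;   // FMOV variants
    bool isFldsFsts = (op & 0xef) == 0x0d;                  // FLDS (xx1D) / FSTS (xx0D)
    return isFmov || isFldsFsts;
}

int main()
{
    // FMOV FR0,FR1 = 0xF10C, FLDS FR2,FPUL = 0xF21D, FADD FR0,FR1 = 0xF100
    bool ok = staysInDynarec(0xF10C) && staysInDynarec(0xF21D)
           && !staysInDynarec(0xF100);
    return ok ? 0 : 1;
}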

View File

@ -7,23 +7,19 @@ extern shil_chfp* shil_chf[];
enum shil_param_type
{
//2 bits
FMT_NULL,
FMT_IMM,
FMT_I32,
FMT_F32,
FMT_F64,
FMT_V2,
FMT_V3,
FMT_V4,
FMT_V8,
FMT_V16,
FMT_REG_BASE=FMT_I32,
FMT_VECTOR_BASE=FMT_V2,
FMT_REG_BASE = FMT_I32,
FMT_VECTOR_BASE = FMT_V4,
FMT_MASK=0xFFFF,
FMT_MASK = 0xFFFF,
};
/*
@ -39,56 +35,54 @@ struct shil_param
{
shil_param()
{
type=FMT_NULL;
_imm=0xFFFFFFFF;
type = FMT_NULL;
_imm = 0xFFFFFFFF;
memset(version, 0, sizeof(version));
}
shil_param(u32 type,u32 imm)
shil_param(u32 imm)
{
this->type=type;
if (type >= FMT_REG_BASE)
new (this) shil_param((Sh4RegType)imm);
_imm=imm;
this->type = FMT_IMM;
_imm = imm;
memset(version, 0, sizeof(version));
}
shil_param(Sh4RegType reg)
{
type=FMT_NULL;
if (reg>=reg_fr_0 && reg<=reg_xf_15)
if (reg >= reg_fr_0 && reg <= reg_xf_15)
{
type=FMT_F32;
_imm=reg;
type = FMT_F32;
_imm = reg;
}
else if (reg>=regv_dr_0 && reg<=regv_dr_14)
else if (reg >= regv_dr_0 && reg <= regv_dr_14)
{
type=FMT_F64;
_imm=(reg-regv_dr_0)*2+reg_fr_0;
type = FMT_F64;
_imm = (reg - regv_dr_0) * 2 + reg_fr_0;
}
else if (reg>=regv_xd_0 && reg<=regv_xd_14)
else if (reg >= regv_xd_0 && reg <= regv_xd_14)
{
type=FMT_F64;
_imm=(reg-regv_xd_0)*2+reg_xf_0;
type = FMT_F64;
_imm = (reg - regv_xd_0) * 2 + reg_xf_0;
}
else if (reg>=regv_fv_0 && reg<=regv_fv_12)
else if (reg >= regv_fv_0 && reg <= regv_fv_12)
{
type=FMT_V4;
_imm=(reg-regv_fv_0)*4+reg_fr_0;
type = FMT_V4;
_imm = (reg - regv_fv_0) * 4 + reg_fr_0;
}
else if (reg==regv_xmtrx)
else if (reg == regv_xmtrx)
{
type=FMT_V16;
_imm=reg_xf_0;
type = FMT_V16;
_imm = reg_xf_0;
}
else if (reg==regv_fmtrx)
else if (reg == regv_fmtrx)
{
type=FMT_V16;
_imm=reg_fr_0;
type = FMT_V16;
_imm = reg_fr_0;
}
else
{
type=FMT_I32;
_reg=reg;
type = FMT_I32;
_reg = reg;
}
memset(version, 0, sizeof(version));
}
@ -106,25 +100,22 @@ struct shil_param
bool is_r32i() const { return type==FMT_I32; }
bool is_r32f() const { return type==FMT_F32; }
u32 is_r32fv() const { return type>=FMT_VECTOR_BASE?count():0; }
u32 is_r32fv() const { return type >= FMT_VECTOR_BASE ? count() : 0; }
bool is_r64f() const { return type==FMT_F64; }
bool is_r32() const { return is_r32i() || is_r32f(); }
bool is_r64() const { return is_r64f(); } //just here for symmetry ...
bool is_imm_s8() const { return is_imm() && (int8_t)_imm == (int32_t)_imm; }
u32* reg_ptr() const { verify(is_reg()); return GetRegPtr(_reg); }
s32 reg_nofs() const { verify(is_reg()); return (s32)((u8*)GetRegPtr(_reg) - (u8*)GetRegPtr(reg_xf_0)-sizeof(Sh4cntx)); }
u32 reg_aofs() const { return -reg_nofs(); }
u32* reg_ptr() const { verify(is_reg()); return GetRegPtr(_reg); }
s32 reg_nofs() const { verify(is_reg()); return (s32)((u8*)GetRegPtr(_reg) - (u8*)GetRegPtr(reg_xf_0)-sizeof(Sh4cntx)); }
u32 reg_aofs() const { return -reg_nofs(); }
u32 imm_value() const { verify(is_imm()); return _imm; }
bool is_vector() const { return type>=FMT_VECTOR_BASE; }
u32 count() const { return type==FMT_F64?2:type==FMT_V2?2:
type==FMT_V3?3:type==FMT_V4?4:type==FMT_V8?8:
type==FMT_V16?16:1; } //count of hardware regs
u32 count() const { return type == FMT_F64 ? 2 :
type == FMT_V4 ? 4 :
type == FMT_V16 ? 16 : 1; } //count of hardware regs
/*
Imms:
@ -134,20 +125,18 @@ struct shil_param
integer regs : is_r32i,is_r32,count=1
fpu regs, single view : is_r32f,is_r32,count=1
fpu regs, double view : is_r64f,count=2
fpu regs, quad view : is_vector,is_r32fv=4, count=4
fpu regs, matrix view : is_vector,is_r32fv=16, count=16
fpu regs, quad view : is_r32fv=4, count=4
fpu regs, matrix view : is_r32fv=16, count=16
*/
};
struct shil_opcode
{
shilop op;
u32 Flow;
u32 flags;
u32 flags2;
u32 size; // memory access size
shil_param rd,rd2;
shil_param rs1,rs2,rs3;
shil_param rd, rd2;
shil_param rs1, rs2, rs3;
u16 host_offs;
u16 guest_offs;
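As the comment block above notes, a double view (FMT_F64) is just two consecutive single-precision guest registers starting at _reg, so half i of a 64-bit value corresponds to guest register _reg + i: the low 32 bits go to half 0 and the high 32 bits to half 1, which is what the backends later in this commit rely on when writing a 64-bit read result into two allocated host registers. A hedged sketch of that split (names are illustrative, not project code):

// Sketch only: split a 64-bit value into the two 32-bit halves that would be
// placed in host registers mapped to guest regs _reg + 0 and _reg + 1.
#include <cstdint>
#include <cstdio>

struct Pair32 { uint32_t half[2]; };           // stand-ins for two host regs

static Pair32 splitU64(uint64_t value)
{
    return { { (uint32_t)value,                // half 0 <- low word  (_reg + 0)
               (uint32_t)(value >> 32) } };    // half 1 <- high word (_reg + 1)
}

int main()
{
    Pair32 p = splitU64(0x4048f5c33f800000ull);
    // half 0 = 0x3f800000 (1.0f as raw bits), half 1 = 0x4048f5c3 (~3.14f)
    std::printf("%08x %08x\n", p.half[0], p.half[1]);
    return 0;
}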

View File

@ -86,7 +86,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op)
shil_param op2_rd = shil_param(op->rd2._reg);
op2_rd.version[0] = op->rd2.version[0];
InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd2));
InsertMov32Op(op2_rd, shil_param(rd2));
// the previous insert might have invalidated our reference
op = &block->oplist[opnum - 1];
@ -151,7 +151,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op)
shil_param op2_rd = shil_param(op->rd2._reg);
op2_rd.version[0] = op->rd2.version[0];
InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd2));
InsertMov32Op(op2_rd, shil_param(rd2));
// the previous insert might have invalidated our reference
op = &block->oplist[opnum - 1];
@ -201,7 +201,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op)
shil_param op2_rd = shil_param((Sh4RegType)(op->rd._reg + 1));
op2_rd.version[0] = op->rd.version[1];
InsertMov32Op(op2_rd, shil_param(FMT_IMM, res >> 32));
InsertMov32Op(op2_rd, shil_param(res >> 32));
// the previous insert might have invalidated our reference
op = &block->oplist[opnum - 1];
@ -328,7 +328,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op)
shil_param op2_rd = shil_param((Sh4RegType)(op->rd._reg + 1));
op2_rd.version[0] = op->rd.version[1];
InsertMov32Op(op2_rd, shil_param(FMT_IMM, rd_1));
InsertMov32Op(op2_rd, shil_param(rd_1));
// the previous insert might have invalidated our reference
op = &block->oplist[opnum - 1];

View File

@ -21,7 +21,6 @@
#include <cstdio>
#include <set>
#include <map>
#include <deque>
#include <cmath>
#include "types.h"
#include "decoder.h"
@ -124,7 +123,7 @@ private:
{
verify(op.rd2.is_null());
op.op = shop_mov32;
op.rs1 = shil_param(FMT_IMM, v);
op.rs1 = shil_param(v);
op.rs2.type = FMT_NULL;
op.rs3.type = FMT_NULL;
stats.constant_ops_replaced++;
@ -235,7 +234,7 @@ private:
if (op.rs1.is_imm() && op.op == shop_readm && block->read_only
&& (op.rs1._imm >> 12) >= (block->vaddr >> 12)
&& (op.rs1._imm >> 12) <= ((block->vaddr + block->sh4_code_size - 1) >> 12)
&& (op.flags & 0x7f) <= 4)
&& op.size <= 4)
{
bool doit = false;
if (mmu_enabled())
@ -262,7 +261,7 @@ private:
if (doit)
{
u32 v;
switch (op.flags & 0x7f)
switch (op.size)
{
case 1:
v = (s32)(::s8)ReadMem8(op.rs1._imm);
@ -513,7 +512,7 @@ private:
// There's quite a few of these
//printf("%08x +t<< %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str());
op.op = shop_shl;
op.rs2 = shil_param(FMT_IMM, 1);
op.rs2 = shil_param(1);
}
// a ^ a == 0
// a - a == 0
@ -526,8 +525,8 @@ private:
else if (op.op == shop_sbc)
{
//printf("%08x ZERO %s\n", block->vaddr + op.guest_offs, op.dissasm().c_str());
op.rs1 = shil_param(FMT_IMM, 0);
op.rs2 = shil_param(FMT_IMM, 0);
op.rs1 = shil_param(0);
op.rs2 = shil_param(0);
stats.prop_constants += 2;
}
// a & a == a

View File

@ -28,7 +28,7 @@
#define ssa_printf(...) DEBUG_LOG(DYNAREC, __VA_ARGS__)
template<typename nreg_t, typename nregf_t>
template<typename nreg_t, typename nregf_t, bool AllocVec2 = false>
class RegAlloc
{
public:
@ -78,17 +78,17 @@ public:
FlushReg((Sh4RegType)i, true);
}
// Flush regs used by vector ops
if (op->rs1.is_reg() && op->rs1.count() > 1)
if (op->rs1.is_reg() && op->rs1.count() > MaxVecSize)
{
for (u32 i = 0; i < op->rs1.count(); i++)
FlushReg((Sh4RegType)(op->rs1._reg + i), false);
}
if (op->rs2.is_reg() && op->rs2.count() > 1)
if (op->rs2.is_reg() && op->rs2.count() > MaxVecSize)
{
for (u32 i = 0; i < op->rs2.count(); i++)
FlushReg((Sh4RegType)(op->rs2._reg + i), false);
}
if (op->rs3.is_reg() && op->rs3.count() > 1)
if (op->rs3.is_reg() && op->rs3.count() > MaxVecSize)
{
for (u32 i = 0; i < op->rs3.count(); i++)
FlushReg((Sh4RegType)(op->rs3._reg + i), false);
@ -100,7 +100,7 @@ public:
AllocSourceReg(op->rs3);
// Hard flush vector ops destination regs
// Note that this is incorrect if a reg is both src (scalar) and dest (vec). However such an op doesn't exist.
if (op->rd.is_reg() && op->rd.count() > 1)
if (op->rd.is_reg() && op->rd.count() > MaxVecSize)
{
for (u32 i = 0; i < op->rd.count(); i++)
{
@ -108,7 +108,7 @@ public:
FlushReg((Sh4RegType)(op->rd._reg + i), true);
}
}
if (op->rd2.is_reg() && op->rd2.count() > 1)
if (op->rd2.is_reg() && op->rd2.count() > MaxVecSize)
{
for (u32 i = 0; i < op->rd2.count(); i++)
{
@ -133,9 +133,7 @@ public:
// Flush normally
for (auto const& reg : reg_alloced)
{
FlushReg(reg.first, false);
}
// Hard flush all dirty regs. Useful for troubleshooting
// while (!reg_alloced.empty())
@ -175,7 +173,7 @@ public:
bool rv = IsAllocAny(prm._reg);
if (prm.count() != 1)
{
for (u32 i = 1;i < prm.count(); i++)
for (u32 i = 1; i < prm.count(); i++)
verify(IsAllocAny((Sh4RegType)(prm._reg + i)) == rv);
}
return rv;
@ -190,7 +188,8 @@ public:
{
if (prm.is_reg())
{
verify(prm.count() == 1);
if (prm.count() > MaxVecSize)
return false;
return IsAllocg(prm._reg);
}
else
@ -203,7 +202,8 @@ public:
{
if (prm.is_reg())
{
verify(prm.count() == 1);
if (prm.count() > MaxVecSize)
return false;
return IsAllocf(prm._reg);
}
else
@ -219,11 +219,11 @@ public:
return mapg(prm._reg);
}
nregf_t mapf(const shil_param& prm)
nregf_t mapf(const shil_param& prm, int index = 0)
{
verify(IsAllocf(prm));
verify(prm.count() == 1);
return mapf(prm._reg);
verify(prm.count() <= MaxVecSize);
return mapf((Sh4RegType)(prm._reg + index));
}
bool reg_used(nreg_t host_reg)
@ -266,6 +266,7 @@ private:
bool write_back;
bool dirty;
};
static constexpr u32 MaxVecSize = AllocVec2 ? 2 : 1;
bool IsFloat(Sh4RegType reg)
{
@ -309,11 +310,16 @@ private:
{
if (!fast_forwarding)
{
ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version, 'a' + reg_alloc.host_reg);
if (IsFloat(reg_num))
{
ssa_printf("WB %s.%d <- xmm%d", name_reg(reg_num).c_str(), reg_alloc.version, reg_alloc.host_reg);
Writeback_FPU(reg_num, (nregf_t)reg_alloc.host_reg);
}
else
{
ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version, 'a' + reg_alloc.host_reg);
Writeback(reg_num, (nreg_t)reg_alloc.host_reg);
}
}
reg_alloc.write_back = false;
reg_alloc.dirty = false;
@ -354,9 +360,12 @@ private:
void AllocSourceReg(const shil_param& param)
{
if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS?
if (!param.is_reg() || param.count() > MaxVecSize)
return;
for (u32 i = 0; i < param.count(); i++)
{
auto it = reg_alloced.find(param._reg);
Sh4RegType sh4reg = (Sh4RegType)(param._reg + i);
auto it = reg_alloced.find(sh4reg);
if (it == reg_alloced.end())
{
u32 host_reg;
@ -380,14 +389,19 @@ private:
host_reg = host_fregs.back();
host_fregs.pop_back();
}
reg_alloced[param._reg] = { host_reg, param.version[0], false, false };
reg_alloced[sh4reg] = { host_reg, param.version[i], false, false };
if (!fast_forwarding)
{
ssa_printf("PL %s.%d -> %cx", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg);
if (IsFloat(param._reg))
Preload_FPU(param._reg, (nregf_t)host_reg);
if (IsFloat(sh4reg))
{
ssa_printf("PL %s.%d -> xmm%d", name_reg(sh4reg).c_str(), param.version[i], host_reg);
Preload_FPU(sh4reg, (nregf_t)host_reg);
}
else
Preload(param._reg, (nreg_t)host_reg);
{
ssa_printf("PL %s.%d -> %cx", name_reg(sh4reg).c_str(), param.version[i], 'a' + host_reg);
Preload(sh4reg, (nreg_t)host_reg);
}
}
}
}
@ -424,9 +438,12 @@ private:
void AllocDestReg(const shil_param& param)
{
if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS?
if (!param.is_reg() || param.count() > MaxVecSize)
return;
for (u32 i = 0; i < param.count(); i++)
{
auto it = reg_alloced.find(param._reg);
Sh4RegType sh4reg = (Sh4RegType)(param._reg + i);
auto it = reg_alloced.find(sh4reg);
if (it == reg_alloced.end())
{
u32 host_reg;
@ -450,18 +467,21 @@ private:
host_reg = host_fregs.back();
host_fregs.pop_back();
}
reg_alloced[param._reg] = { host_reg, param.version[0], NeedsWriteBack(param._reg, param.version[0]), true };
ssa_printf(" %s.%d -> %cx %s", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg, reg_alloced[param._reg].write_back ? "(wb)" : "");
reg_alloced[sh4reg] = { host_reg, param.version[i], NeedsWriteBack(sh4reg, param.version[i]), true };
if (param.is_r32i())
ssa_printf(" %s.%d -> %cx %s", name_reg(sh4reg).c_str(), param.version[i], 'a' + host_reg, reg_alloced[sh4reg].write_back ? "(wb)" : "");
else
ssa_printf(" %s.%d -> xmm%d %s", name_reg(sh4reg).c_str(), param.version[i], host_reg, reg_alloced[sh4reg].write_back ? "(wb)" : "");
}
else
{
reg_alloc& reg = reg_alloced[param._reg];
reg_alloc& reg = reg_alloced[sh4reg];
verify(!reg.write_back);
reg.write_back = NeedsWriteBack(param._reg, param.version[0]);
reg.write_back = NeedsWriteBack(sh4reg, param.version[i]);
reg.dirty = true;
reg.version = param.version[0];
reg.version = param.version[i];
}
verify(reg_alloced[param._reg].dirty);
verify(reg_alloced[sh4reg].dirty);
}
}
@ -544,22 +564,26 @@ private:
bool IsVectorOp(shil_opcode* op)
{
return op->rs1.count() > 1 || op->rs2.count() > 1 || op->rs3.count() > 1 || op->rd.count() > 1 || op->rd2.count() > 1;
return op->rs1.count() > MaxVecSize
|| op->rs2.count() > MaxVecSize
|| op->rs3.count() > MaxVecSize
|| op->rd.count() > MaxVecSize
|| op->rd2.count() > MaxVecSize;
}
bool UsesReg(shil_opcode* op, Sh4RegType reg, u32 version, bool vector)
{
if (op->rs1.is_reg() && reg >= op->rs1._reg && reg < (Sh4RegType)(op->rs1._reg + op->rs1.count())
&& version == op->rs1.version[reg - op->rs1._reg]
&& vector == (op->rs1.count() > 1))
&& vector == (op->rs1.count() > MaxVecSize))
return true;
if (op->rs2.is_reg() && reg >= op->rs2._reg && reg < (Sh4RegType)(op->rs2._reg + op->rs2.count())
&& version == op->rs2.version[reg - op->rs2._reg]
&& vector == (op->rs2.count() > 1))
&& vector == (op->rs2.count() > MaxVecSize))
return true;
if (op->rs3.is_reg() && reg >= op->rs3._reg && reg < (Sh4RegType)(op->rs3._reg + op->rs3.count())
&& version == op->rs3.version[reg - op->rs3._reg]
&& vector == (op->rs3.count() > 1))
&& vector == (op->rs3.count() > MaxVecSize))
return true;
return false;
@ -568,10 +592,10 @@ private:
bool DefsReg(shil_opcode* op, Sh4RegType reg, bool vector)
{
if (op->rd.is_reg() && reg >= op->rd._reg && reg < (Sh4RegType)(op->rd._reg + op->rd.count())
&& vector == (op->rd.count() > 1))
&& vector == (op->rd.count() > MaxVecSize))
return true;
if (op->rd2.is_reg() && reg >= op->rd2._reg && reg < (Sh4RegType)(op->rd2._reg + op->rd2.count())
&& vector == (op->rd2.count() > 1))
&& vector == (op->rd2.count() > MaxVecSize))
return true;
return false;
}

View File

@ -62,6 +62,7 @@ void sh4_rio_reg(RegisterStruct *arr, u32 addr, RegIO flags, RegReadAddrFP* rf,
}
else
{
verify(!(flags & REG_WO)); // not supported here
if (flags & REG_RF)
arr[idx].readFunctionAddr = rf;
else

View File

@ -120,7 +120,7 @@ const int alloc_regs[] = { 5, 6, 7, 10, 11, -1 };
const int alloc_fpu[] = { 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, -1 };
struct arm_reg_alloc: RegAlloc<int, int>
struct arm_reg_alloc: RegAlloc<int, int, true>
{
void Preload(u32 reg, int nreg) override
{
@ -149,9 +149,9 @@ struct arm_reg_alloc: RegAlloc<int, int>
ass.Vstr(SRegister(nreg), MemOperand(r8, shRegOffs));
}
SRegister mapFReg(const shil_param& prm)
SRegister mapFReg(const shil_param& prm, int index = 0)
{
return SRegister(mapf(prm));
return SRegister(mapf(prm, index));
}
Register mapReg(const shil_param& prm)
{
@ -561,16 +561,15 @@ enum mem_op_type
static mem_op_type memop_type(shil_opcode* op)
{
int sz = op->flags & 0x7f;
bool fp32 = op->rs2.is_r32f() || op->rd.is_r32f();
if (sz == 1)
if (op->size == 1)
return SZ_8;
else if (sz == 2)
else if (op->size == 2)
return SZ_16;
else if (sz == 4)
else if (op->size == 4)
return fp32 ? SZ_32F : SZ_32I;
else if (sz == 8)
else if (op->size == 8)
return SZ_64F;
die("Unknown op");
@ -855,16 +854,15 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool
if (!op->rs1.is_imm())
return false;
u32 size = op->flags & 0x7f;
u32 addr = op->rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op->size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
return false;
u32 paddr;
u32 rv;
switch (size)
switch (op->size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DREAD, u8>(addr, paddr);
@ -914,8 +912,16 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool
break;
case SZ_64F:
ass.Vldr(d0, MemOperand(r0));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
if (reg.IsAllocf(op->rd))
{
ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0));
ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4));
}
else
{
ass.Vldr(d0, MemOperand(r0));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
}
}
@ -928,11 +934,17 @@ static bool ngen_readm_immediate(RuntimeBlockInfo* block, shil_opcode* op, bool
// Need to call the handler twice
ass.Mov(r0, op->rs1._imm);
call(ptr);
ass.Str(r0, MemOperand(r8, op->rd.reg_nofs()));
if (reg.IsAllocf(op->rd))
ass.Vmov(reg.mapFReg(op->rd, 0), r0);
else
ass.Str(r0, MemOperand(r8, op->rd.reg_nofs()));
ass.Mov(r0, op->rs1._imm + 4);
call(ptr);
ass.Str(r0, MemOperand(r8, op->rd.reg_nofs() + 4));
if (reg.IsAllocf(op->rd))
ass.Vmov(reg.mapFReg(op->rd, 1), r0);
else
ass.Str(r0, MemOperand(r8, op->rd.reg_nofs() + 4));
}
else
{
@ -975,16 +987,15 @@ static bool ngen_writemem_immediate(RuntimeBlockInfo* block, shil_opcode* op, bo
if (!op->rs1.is_imm())
return false;
u32 size = op->flags & 0x7f;
u32 addr = op->rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op->size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
return false;
u32 paddr;
u32 rv;
switch (size)
switch (op->size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DWRITE, u8>(addr, paddr);
@ -1041,8 +1052,16 @@ static bool ngen_writemem_immediate(RuntimeBlockInfo* block, shil_opcode* op, bo
break;
case SZ_64F:
ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs()));
ass.Vstr(d0, MemOperand(r0));
if (reg.IsAllocf(op->rs2))
{
ass.Vstr(reg.mapFReg(op->rs2, 0), MemOperand(r0));
ass.Vstr(reg.mapFReg(op->rs2, 1), MemOperand(r0, 4));
}
else
{
ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs()));
ass.Vstr(d0, MemOperand(r0));
}
break;
default:
@ -1157,9 +1176,20 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
case SZ_64F:
ass.Add(r1, r1, r8); //3 opcodes, there's no [REG+REG] VLDR
ass.Vldr(d0, MemOperand(r1)); //TODO: use reg alloc
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
ass.Vldr(d0, MemOperand(r1));
if (reg.IsAllocf(op->rd))
{
ass.Vmov(r0, r1, d0);
ass.Vmov(reg.mapFReg(op->rd, 0), r0);
ass.Vmov(reg.mapFReg(op->rd, 1), r1);
// easier to do just this but we need to use a different op than 32f to distinguish during rewrite
//ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r1));
//ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r1, 4));
}
else
{
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
}
} else {
@ -1183,7 +1213,16 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
case SZ_64F:
vmem_slowpath(raddr, r0, s0, d0, optp, true);
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
if (reg.IsAllocf(op->rd))
{
ass.Vmov(r0, r1, d0);
ass.Vmov(reg.mapFReg(op->rd, 0), r0);
ass.Vmov(reg.mapFReg(op->rd, 1), r1);
}
else
{
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
}
}
@ -1201,9 +1240,19 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
Register rs2 = r2;
SRegister rs2f = s2;
//TODO: use reg alloc
if (optp == SZ_64F)
ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs()));
{
if (reg.IsAllocf(op->rs2))
{
ass.Vmov(r2, reg.mapFReg(op->rs2, 0));
ass.Vmov(r3, reg.mapFReg(op->rs2, 1));
ass.Vmov(d0, r2, r3);
}
else
{
ass.Vldr(d0, MemOperand(r8, op->rs2.reg_nofs()));
}
}
else if (op->rs2.is_imm())
{
ass.Mov(rs2, op->rs2._imm);
@ -1242,7 +1291,7 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
case SZ_64F:
ass.Add(r1, r1, r8); //3 opcodes: there's no [REG+REG] VLDR, also required for SQ
ass.Vstr(d0, MemOperand(r1)); //TODO: use reg alloc
ass.Vstr(d0, MemOperand(r1));
break;
}
} else {
@ -1358,9 +1407,18 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
break;
case shop_mov64:
verify(op->rs1.is_r64() && op->rd.is_r64());
ass.Vldr(d0, MemOperand(r8, op->rs1.reg_nofs()));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
verify(op->rs1.is_r64f() && op->rd.is_r64f());
if (reg.IsAllocf(op->rd))
{
verify(reg.IsAllocf(op->rs1));
ass.Vmov(reg.mapFReg(op->rd, 0), reg.mapFReg(op->rs1, 0));
ass.Vmov(reg.mapFReg(op->rd, 1), reg.mapFReg(op->rs1, 1));
}
else
{
ass.Vldr(d0, MemOperand(r8, op->rs1.reg_nofs()));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
case shop_jcond:
@ -1821,8 +1879,16 @@ static void ngen_compile_opcode(RuntimeBlockInfo* block, shil_opcode* op, bool o
ass.Add(r0, r1, Operand(r0, LSL, 3));
ass.Vldr(d0, MemOperand(r0));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
if (reg.IsAllocf(op->rd))
{
ass.Vldr(reg.mapFReg(op->rd, 0), MemOperand(r0));
ass.Vldr(reg.mapFReg(op->rd, 1), MemOperand(r0, 4));
}
else
{
ass.Vldr(d0, MemOperand(r0));
ass.Vstr(d0, MemOperand(r8, op->rd.reg_nofs()));
}
break;
case shop_fipr:

View File

@ -21,6 +21,8 @@
#include <aarch64/macro-assembler-aarch64.h>
using namespace vixl::aarch64;
#define ALLOC_F64 true
enum eReg {
W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W16,
W17, W18, W19, W20, W21, W22, W23, W24, W25, W26, W27, W28, W29, W30
@ -35,7 +37,7 @@ static eFReg alloc_fregs[] = { S8, S9, S10, S11, S12, S13, S14, S15, (eFReg)-1 }
class Arm64Assembler;
struct Arm64RegAlloc : RegAlloc<eReg, eFReg>
struct Arm64RegAlloc : RegAlloc<eReg, eFReg, ALLOC_F64>
{
Arm64RegAlloc(Arm64Assembler *assembler) : assembler(assembler) {}
@ -57,9 +59,9 @@ struct Arm64RegAlloc : RegAlloc<eReg, eFReg>
return Register::GetWRegFromCode(ereg);
}
const VRegister& MapVRegister(const shil_param& param)
const VRegister& MapVRegister(const shil_param& param, int index = 0)
{
eFReg ereg = mapf(param);
eFReg ereg = mapf(param, index);
if (ereg == (eFReg)-1)
die("VRegister not allocated");
return VRegister::GetSRegFromCode(ereg);

View File

@ -387,11 +387,28 @@ public:
break;
case shop_mov64:
verify(op.rd.is_reg());
verify(op.rs1.is_reg() || op.rs1.is_imm());
{
verify(op.rd.is_reg());
verify(op.rs1.is_reg() || op.rs1.is_imm());
shil_param_to_host_reg(op.rs1, x15);
host_reg_to_shil_param(op.rd, x15);
if (!regalloc.IsAllocf(op.rd))
{
verify(!regalloc.IsAllocf(op.rs1));
shil_param_to_host_reg(op.rs1, x15);
host_reg_to_shil_param(op.rd, x15);
}
else
{
const VRegister& rd0 = regalloc.MapVRegister(op.rd, 0);
const VRegister& rs0 = regalloc.MapVRegister(op.rs1, 0);
if (!rd0.Is(rs0))
Fmov(rd0, rs0);
const VRegister& rd1 = regalloc.MapVRegister(op.rd, 1);
const VRegister& rs1 = regalloc.MapVRegister(op.rs1, 1);
if (!rd1.Is(rs1))
Fmov(rd1, rs1);
}
}
break;
case shop_readm:
@ -904,8 +921,15 @@ public:
Add(x1, x1, Operand(regalloc.MapRegister(op.rs1), UXTH, 3));
else
Add(x1, x1, Operand(op.rs1.imm_value() << 3));
Ldr(x2, MemOperand(x1));
Str(x2, sh4_context_mem_operand(op.rd.reg_ptr()));
if (regalloc.IsAllocf(op.rd))
{
Ldp(regalloc.MapVRegister(op.rd, 0), regalloc.MapVRegister(op.rd, 1), MemOperand(x1));
}
else
{
Ldr(x2, MemOperand(x1));
Str(x2, sh4_context_mem_operand(op.rd.reg_ptr()));
}
break;
case shop_fipr:
@ -1659,14 +1683,13 @@ private:
GenMemAddr(op, &w0);
genMmuLookup(op, 0);
u32 size = op.flags & 0x7f;
if (!optimise || !GenReadMemoryFast(op, opid))
GenReadMemorySlow(size);
GenReadMemorySlow(op.size);
if (size < 8)
if (op.size < 8)
host_reg_to_shil_param(op.rd, w0);
else
Str(x0, sh4_context_mem_operand(op.rd.reg_ptr()));
host_reg_to_shil_param(op.rd, x0);
}
bool GenReadMemoryImmediate(const shil_opcode& op)
@ -1674,16 +1697,15 @@ private:
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op.size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
return false;
u32 paddr;
u32 rv;
switch (size)
switch (op.size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DREAD, u8>(addr, paddr);
@ -1705,14 +1727,14 @@ private:
addr = paddr;
}
bool isram = false;
void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 4 : op.size);
if (isram)
{
Ldr(x1, reinterpret_cast<uintptr_t>(ptr)); // faster than Mov
if (regalloc.IsAllocAny(op.rd))
{
switch (size)
switch (op.size)
{
case 1:
Ldrsb(regalloc.MapRegister(op.rd), MemOperand(x1));
@ -1729,6 +1751,10 @@ private:
Ldr(regalloc.MapRegister(op.rd), MemOperand(x1));
break;
case 8:
Ldp(regalloc.MapVRegister(op.rd, 0), regalloc.MapVRegister(op.rd, 1), MemOperand(x1));
break;
default:
die("Invalid size");
break;
@ -1736,7 +1762,7 @@ private:
}
else
{
switch (size)
switch (op.size)
{
case 1:
Ldrsb(w1, MemOperand(x1));
@ -1758,7 +1784,7 @@ private:
die("Invalid size");
break;
}
if (size == 8)
if (op.size == 8)
Str(x1, sh4_context_mem_operand(op.rd.reg_ptr()));
else
Str(w1, sh4_context_mem_operand(op.rd.reg_ptr()));
@ -1767,23 +1793,28 @@ private:
else
{
// Not RAM
if (size == 8)
if (op.size == 8)
{
verify(!regalloc.IsAllocAny(op.rd));
// Need to call the handler twice
Mov(w0, addr);
GenCallRuntime((void (*)())ptr);
Str(w0, sh4_context_mem_operand(op.rd.reg_ptr()));
if (regalloc.IsAllocf(op.rd))
Fmov(regalloc.MapVRegister(op.rd, 0), w0);
else
Str(w0, sh4_context_mem_operand(op.rd.reg_ptr()));
Mov(w0, addr + 4);
GenCallRuntime((void (*)())ptr);
Str(w0, sh4_context_mem_operand((u8*)op.rd.reg_ptr() + 4));
if (regalloc.IsAllocf(op.rd))
Fmov(regalloc.MapVRegister(op.rd, 1), w0);
else
Str(w0, sh4_context_mem_operand((u8*)op.rd.reg_ptr() + 4));
}
else
{
Mov(w0, addr);
switch(size)
switch(op.size)
{
case 1:
GenCallRuntime((void (*)())ptr);
@ -1830,8 +1861,7 @@ private:
Ubfx(x1, x0, 0, 29);
Add(x1, x1, sizeof(Sh4Context), LeaveFlags);
u32 size = op.flags & 0x7f;
switch(size)
switch (op.size)
{
case 1:
Ldrsb(w0, MemOperand(x28, x1));
@ -1862,15 +1892,14 @@ private:
GenMemAddr(op, &w0);
genMmuLookup(op, 1);
u32 size = op.flags & 0x7f;
if (size != 8)
if (op.size != 8)
shil_param_to_host_reg(op.rs2, w1);
else
shil_param_to_host_reg(op.rs2, x1);
if (optimise && GenWriteMemoryFast(op, opid))
return;
GenWriteMemorySlow(size);
GenWriteMemorySlow(op.size);
}
bool GenWriteMemoryImmediate(const shil_opcode& op)
@ -1878,16 +1907,15 @@ private:
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op.size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
return false;
u32 paddr;
u32 rv;
switch (size)
switch (op.size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DWRITE, u8>(addr, paddr);
@ -1909,11 +1937,11 @@ private:
addr = paddr;
}
bool isram = false;
void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 4 : op.size);
Register reg2;
if (size != 8)
if (isram)
{
Register reg2;
if (op.rs2.is_imm())
{
Mov(w1, op.rs2._imm);
@ -1923,6 +1951,11 @@ private:
{
reg2 = regalloc.MapRegister(op.rs2);
}
else if (op.size == 8)
{
shil_param_to_host_reg(op.rs2, x1);
reg2 = x1;
}
else if (regalloc.IsAllocf(op.rs2))
{
Fmov(w1, regalloc.MapVRegister(op.rs2));
@ -1930,11 +1963,9 @@ private:
}
else
die("Invalid rs2 param");
}
if (isram)
{
Ldr(x0, reinterpret_cast<uintptr_t>(ptr));
switch (size)
switch (op.size)
{
case 1:
Strb(reg2, MemOperand(x0));
@ -1949,8 +1980,7 @@ private:
break;
case 8:
shil_param_to_host_reg(op.rs2, x1);
Str(x1, MemOperand(x0));
Str(reg2, MemOperand(x0));
break;
default:
@ -1962,10 +1992,10 @@ private:
{
// Not RAM
Mov(w0, addr);
if (size == 8)
shil_param_to_host_reg(op.rs2, x1);
if (op.size == 8)
{
// Need to call the handler twice
shil_param_to_host_reg(op.rs2, x1);
GenCallRuntime((void (*)())ptr);
Mov(w0, addr + 4);
@ -1975,7 +2005,6 @@ private:
}
else
{
Mov(w1, reg2);
GenCallRuntime((void (*)())ptr);
}
}
@ -1996,8 +2025,7 @@ private:
Ubfx(x7, x0, 0, 29);
Add(x7, x7, sizeof(Sh4Context), LeaveFlags);
u32 size = op.flags & 0x7f;
switch(size)
switch(op.size)
{
case 1:
Strb(w1, MemOperand(x28, x7));
@ -2112,21 +2140,28 @@ private:
}
else if (param.is_reg())
{
if (param.is_r64f())
if (param.is_r64f() && !regalloc.IsAllocf(param))
{
Ldr(reg, sh4_context_mem_operand(param.reg_ptr()));
else if (param.is_r32f())
}
else if (param.is_r32f() || param.is_r64f())
{
if (regalloc.IsAllocf(param))
Fmov(reg, regalloc.MapVRegister(param));
Fmov(reg.W(), regalloc.MapVRegister(param, 0));
else
Ldr(reg, sh4_context_mem_operand(param.reg_ptr()));
Ldr(reg.W(), sh4_context_mem_operand(param.reg_ptr()));
if (param.is_r64f())
{
Fmov(w15, regalloc.MapVRegister(param, 1));
Bfm(reg, x15, 32, 31);
}
}
else
{
if (regalloc.IsAllocg(param))
Mov(reg, regalloc.MapRegister(param));
Mov(reg.W(), regalloc.MapRegister(param));
else
Ldr(reg, sh4_context_mem_operand(param.reg_ptr()));
Ldr(reg.W(), sh4_context_mem_operand(param.reg_ptr()));
}
}
else
@ -2139,7 +2174,17 @@ private:
{
if (reg.Is64Bits())
{
Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr()));
if (regalloc.IsAllocf(param))
{
verify(param.count() == 2);
Fmov(regalloc.MapVRegister(param, 0), reg.W());
Lsr(reg.X(), reg.X(), 32);
Fmov(regalloc.MapVRegister(param, 1), reg.W());
}
else
{
Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr()));
}
}
else if (regalloc.IsAllocg(param))
{

View File

@ -1665,7 +1665,7 @@ public:
case shop_readm:
{
u32 size = op.flags & 0x7f;
u32 size = op.size;
if (op.rs1.is_imm()) {
verify(op.rs2.is_null() && op.rs3.is_null());
@ -1748,7 +1748,7 @@ public:
case shop_writem:
{
u32 size = op.flags & 0x7f;
u32 size = op.size;
if (op.rs1.is_imm()) {
verify(op.rs3.is_null());

View File

@ -215,13 +215,24 @@ public:
case shop_mov64:
{
verify(op.rd.is_r64());
verify(op.rs1.is_r64());
verify(op.rd.is_r64f());
verify(op.rs1.is_r64f());
#if ALLOC_F64 == false
mov(rax, (uintptr_t)op.rs1.reg_ptr());
mov(rax, qword[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(qword[rcx], rax);
#else
Xbyak::Xmm rd = regalloc.MapXRegister(op.rd, 0);
Xbyak::Xmm rs = regalloc.MapXRegister(op.rs1, 0);
if (rd != rs)
movss(rd, rs);
rd = regalloc.MapXRegister(op.rd, 1);
rs = regalloc.MapXRegister(op.rs1, 1);
if (rd != rs)
movss(rd, rs);
#endif
}
break;
@ -244,16 +255,18 @@ public:
}
genMmuLookup(block, op, 0);
int size = op.flags & 0x7f;
size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64;
int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64;
GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::R], mmu_enabled());
if (size != MemSize::S64)
host_reg_to_shil_param(op.rd, eax);
else {
#if ALLOC_F64 == false
if (size == MemSize::S64)
{
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(qword[rcx], rax);
}
else
#endif
host_reg_to_shil_param(op.rd, rax);
}
break;
@ -276,15 +289,17 @@ public:
}
genMmuLookup(block, op, 1);
u32 size = op.flags & 0x7f;
if (size != 8)
shil_param_to_host_reg(op.rs2, call_regs[1]);
else {
#if ALLOC_F64 == false
if (op.size == 8)
{
mov(rax, (uintptr_t)op.rs2.reg_ptr());
mov(call_regs64[1], qword[rax]);
}
else
#endif
shil_param_to_host_reg(op.rs2, call_regs64[1]);
size = size == 1 ? MemSize::S8 : size == 2 ? MemSize::S16 : size == 4 ? MemSize::S32 : MemSize::S64;
int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64;
GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::W], mmu_enabled());
}
}
@ -809,9 +824,8 @@ private:
{
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op.size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
@ -819,7 +833,7 @@ private:
u32 paddr;
u32 rv;
switch (size)
switch (op.size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DREAD, u8>(addr, paddr);
@ -841,13 +855,13 @@ private:
addr = paddr;
}
bool isram = false;
void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 4 : op.size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (size)
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rd))
@ -885,9 +899,14 @@ private:
break;
case 8:
#if ALLOC_F64 == false
mov(rcx, qword[rax]);
mov(rax, (uintptr_t)op.rd.reg_ptr());
mov(qword[rax], rcx);
#else
movd(regalloc.MapXRegister(op.rd, 0), dword[rax]);
movd(regalloc.MapXRegister(op.rd, 1), dword[rax + 4]);
#endif
break;
default:
@ -898,26 +917,32 @@ private:
else
{
// Not RAM: the returned pointer is a memory handler
if (size == 8)
if (op.size == 8)
{
verify(!regalloc.IsAllocAny(op.rd));
// Need to call the handler twice
mov(call_regs[0], addr);
GenCall((void (*)())ptr);
#if ALLOC_F64 == false
mov(rcx, (size_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
#else
mov(regalloc.MapXRegister(op.rd, 0), eax);
#endif
mov(call_regs[0], addr + 4);
GenCall((void (*)())ptr);
#if ALLOC_F64 == false
mov(rcx, (size_t)op.rd.reg_ptr() + 4);
mov(dword[rcx], eax);
#else
mov(regalloc.MapXRegister(op.rd, 1), eax);
#endif
}
else
{
mov(call_regs[0], addr);
switch(size)
switch(op.size)
{
case 1:
GenCall((void (*)())ptr);
@ -948,9 +973,8 @@ private:
{
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1._imm;
if (mmu_enabled() && mmu_is_translated(addr, size))
if (mmu_enabled() && mmu_is_translated(addr, op.size))
{
if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12)))
// When full mmu is on, only consider addresses in the same 4k page
@ -958,7 +982,7 @@ private:
u32 paddr;
u32 rv;
switch (size)
switch (op.size)
{
case 1:
rv = mmu_data_translation<MMU_TT_DWRITE, u8>(addr, paddr);
@ -980,13 +1004,13 @@ private:
addr = paddr;
}
bool isram = false;
void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 4 : op.size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (size)
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rs2))
@ -1030,9 +1054,14 @@ private:
break;
case 8:
#if ALLOC_F64 == false
mov(rcx, (uintptr_t)op.rs2.reg_ptr());
mov(rcx, qword[rcx]);
mov(qword[rax], rcx);
#else
movd(dword[rax], regalloc.MapXRegister(op.rs2, 0));
movd(dword[rax + 4], regalloc.MapXRegister(op.rs2, 1));
#endif
break;
default:

View File

@ -25,15 +25,18 @@
static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI,
Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 };
static s8 alloc_fregs[] = { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 }; // XMM6 to XMM15 are callee-saved in Windows
#define ALLOC_F64 true
#else
static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13,
Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 };
static s8 alloc_fregs[] = { 8, 9, 10, 11, -1 }; // XMM8-11
// all xmm registers are caller-saved on linux
#define ALLOC_F64 false
#endif
class BlockCompiler;
struct X64RegAlloc : RegAlloc<Xbyak::Operand::Code, s8>
struct X64RegAlloc : RegAlloc<Xbyak::Operand::Code, s8, ALLOC_F64>
{
X64RegAlloc(BlockCompiler *compiler) : compiler(compiler) {}
@ -55,9 +58,9 @@ struct X64RegAlloc : RegAlloc<Xbyak::Operand::Code, s8>
return Xbyak::Reg32(ereg);
}
Xbyak::Xmm MapXRegister(const shil_param& param)
Xbyak::Xmm MapXRegister(const shil_param& param, int index = 0)
{
s8 ereg = mapf(param);
s8 ereg = mapf(param, index);
if (ereg == -1)
die("VRegister not allocated");
return Xbyak::Xmm(ereg);

View File

@ -587,13 +587,19 @@ protected:
#ifndef XBYAK32
mov(rcx, (uintptr_t)&sin_table);
mov(rcx, qword[rcx + rax * 8]);
#if ALLOC_F64 == false
mov(rdx, (uintptr_t)op.rd.reg_ptr());
mov(qword[rdx], rcx);
#else
movd(mapXRegister(op.rd, 0), ecx);
shr(rcx, 32);
movd(mapXRegister(op.rd, 1), ecx);
#endif
#endif
}
else
{
#ifdef EXPLODE_SPANS
#if ALLOC_F64 == true
movss(mapXRegister(op.rd, 0), dword[(size_t)&sin_table + eax * 8]);
movss(mapXRegister(op.rd, 1), dword[(size_t)&sin_table[0].u[1] + eax * 8]);
#else
@ -653,15 +659,25 @@ protected:
}
else if (param.is_reg())
{
if (param.is_r32f())
if (isAllocf(param))
{
if (isAllocf(param))
if (param.is_r32f() || param.is_r64f())
{
Xbyak::Xmm sreg = mapXRegister(param);
Xbyak::Xmm sreg = mapXRegister(param, 0);
if (!reg.isXMM())
movd((const Xbyak::Reg32 &)reg, sreg);
movd(reg.cvt32(), sreg);
else if (reg != sreg)
movss((const Xbyak::Xmm &)reg, sreg);
#ifndef XBYAK32
if (param.is_r64f())
{
sreg = mapXRegister(param, 1);
verify(reg != rax);
movd(eax, sreg);
shl(rax, 32);
or_(reg, rax);
}
#endif
}
else
{
@ -670,44 +686,41 @@ protected:
{
#ifndef XBYAK32
mov(rax, (size_t)param.reg_ptr());
mov((const Xbyak::Reg32 &)reg, dword[rax]);
mov(reg.cvt32(), dword[rax]);
#endif
}
else
{
mov((const Xbyak::Reg32 &)reg, dword[param.reg_ptr()]);
mov(reg.cvt32(), dword[param.reg_ptr()]);
}
}
}
else if (isAllocg(param))
{
Xbyak::Reg32 sreg = mapRegister(param);
if (reg.isXMM())
movd((const Xbyak::Xmm &)reg, sreg);
else if (reg != sreg)
mov(reg.cvt32(), sreg);
}
else
{
if (isAllocg(param))
if (ArchX64)
{
Xbyak::Reg32 sreg = mapRegister(param);
if (reg.isXMM())
movd((const Xbyak::Xmm &)reg, sreg);
else if (reg != sreg)
mov((const Xbyak::Reg32 &)reg, sreg);
#ifndef XBYAK32
mov(rax, (size_t)param.reg_ptr());
if (!reg.isXMM())
mov(reg.cvt32(), dword[rax]);
else
movss((const Xbyak::Xmm &)reg, dword[rax]);
#endif
}
else
{
if (ArchX64)
{
#ifndef XBYAK32
mov(rax, (size_t)param.reg_ptr());
if (!reg.isXMM())
mov((const Xbyak::Reg32 &)reg, dword[rax]);
else
movss((const Xbyak::Xmm &)reg, dword[rax]);
#endif
}
if (!reg.isXMM())
mov(reg.cvt32(), dword[param.reg_ptr()]);
else
{
if (!reg.isXMM())
mov((const Xbyak::Reg32 &)reg, dword[param.reg_ptr()]);
else
movss((const Xbyak::Xmm &)reg, dword[param.reg_ptr()]);
}
movss((const Xbyak::Xmm &)reg, dword[param.reg_ptr()]);
}
}
}
@ -724,17 +737,25 @@ protected:
{
Xbyak::Reg32 sreg = mapRegister(param);
if (!reg.isXMM())
mov(sreg, (const Xbyak::Reg32 &)reg);
mov(sreg, reg.cvt32());
else if (reg != sreg)
movd(sreg, (const Xbyak::Xmm &)reg);
}
else if (isAllocf(param))
{
Xbyak::Xmm sreg = mapXRegister(param);
Xbyak::Xmm sreg = mapXRegister(param, 0);
if (!reg.isXMM())
movd(sreg, (const Xbyak::Reg32 &)reg);
movd(sreg, reg.cvt32());
else if (reg != sreg)
movss(sreg, (const Xbyak::Xmm &)reg);
#ifndef XBYAK32
if (param.is_r64f())
{
sreg = mapXRegister(param, 1);
shr(reg, 32);
movd(sreg, reg.cvt32());
}
#endif
}
else
{
@ -743,7 +764,7 @@ protected:
#ifndef XBYAK32
mov(rax, (size_t)param.reg_ptr());
if (!reg.isXMM())
mov(dword[rax], (const Xbyak::Reg32 &)reg);
mov(dword[rax], reg.cvt32());
else
movss(dword[rax], (const Xbyak::Xmm &)reg);
#endif
@ -751,7 +772,7 @@ protected:
else
{
if (!reg.isXMM())
mov(dword[param.reg_ptr()], (const Xbyak::Reg32 &)reg);
mov(dword[param.reg_ptr()], reg.cvt32());
else
movss(dword[param.reg_ptr()], (const Xbyak::Xmm &)reg);
}
@ -763,16 +784,16 @@ private:
return static_cast<T*>(this)->regalloc.MapRegister(param);
}
Xbyak::Xmm mapXRegister(const shil_param& param) {
return static_cast<T*>(this)->regalloc.MapXRegister(param);
Xbyak::Xmm mapXRegister(const shil_param& param, int index = 0) {
return static_cast<T*>(this)->regalloc.MapXRegister(param, index);
}
int mapg(const shil_param& param) {
return (int)static_cast<T*>(this)->regalloc.mapg(param);
}
int mapf(const shil_param& param) {
return (int)static_cast<T*>(this)->regalloc.mapf(param);
int mapf(const shil_param& param, int index = 0) {
return (int)static_cast<T*>(this)->regalloc.mapf(param, index);
}
bool isAllocg(const shil_param& param) {

View File

@ -526,15 +526,14 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b
{
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1.imm_value();
bool isram = false;
void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_read_const(addr, isram, op.size > 4 ? 4 : op.size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
switch (size)
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rd))
@ -569,14 +568,12 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b
break;
case 8:
#ifdef EXPLODE_SPANS
if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1))
if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd))
{
movd(regalloc.MapXRegister(op.rd, 0), dword[ptr]);
movd(regalloc.MapXRegister(op.rd, 1), dword[(u32 *)ptr + 1]);
}
else
#endif
{
movq(xmm0, qword[ptr]);
movq(qword[op.rd.reg_ptr()], xmm0);
@ -591,7 +588,7 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b
else
{
// Not RAM: the returned pointer is a memory handler
if (size == 8)
if (op.size == 8)
{
verify(!regalloc.IsAllocAny(op.rd));
@ -608,7 +605,7 @@ bool X86Compiler::genReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* b
{
mov(ecx, addr);
switch(size)
switch(op.size)
{
case 1:
genCall((void (DYNACALL *)())ptr);
@ -639,15 +636,14 @@ bool X86Compiler::genWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo*
{
if (!op.rs1.is_imm())
return false;
u32 size = op.flags & 0x7f;
u32 addr = op.rs1.imm_value();
bool isram = false;
void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size);
void* ptr = _vmem_write_const(addr, isram, op.size > 4 ? 4 : op.size);
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
switch (size)
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rs2))
@ -697,14 +693,12 @@ bool X86Compiler::genWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo*
break;
case 8:
#ifdef EXPLODE_SPANS
if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1))
if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2))
{
movd(dword[ptr], regalloc.MapXRegister(op.rs2, 0));
movd(dword[(u32 *)ptr + 1], regalloc.MapXRegister(op.rs2, 1));
}
else
#endif
{
movq(xmm0, qword[op.rs2.reg_ptr()]);
movq(qword[ptr], xmm0);

View File

@ -268,10 +268,10 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode&
break;
case shop_mov64:
verify(op.rd.is_r64());
verify(op.rs1.is_r64());
verify(op.rd.is_r64f());
verify(op.rs1.is_r64f());
#ifdef EXPLODE_SPANS
#if ALLOC_F64 == true
movss(regalloc.MapXRegister(op.rd, 0), regalloc.MapXRegister(op.rs1, 0));
movss(regalloc.MapXRegister(op.rd, 1), regalloc.MapXRegister(op.rs1, 1));
#else
@ -297,7 +297,7 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode&
}
int memOpSize;
switch (op.flags & 0x7f)
switch (op.size)
{
case 1:
memOpSize = MemSize::S8;
@ -329,14 +329,12 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode&
}
else
{
#ifdef EXPLODE_SPANS
if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1))
if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd))
{
mov(regalloc.MapXRegister(op.rd, 0), xmm0);
mov(regalloc.MapXRegister(op.rd, 1), xmm1);
}
else
#endif
{
verify(!regalloc.IsAllocAny(op.rd));
movss(dword[op.rd.reg_ptr()], xmm0);
@ -361,7 +359,7 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode&
}
int memOpSize;
switch (op.flags & 0x7f)
switch (op.size)
{
case 1:
memOpSize = MemSize::S8;
@ -382,14 +380,12 @@ void X86Compiler::genOpcode(RuntimeBlockInfo* block, bool optimise, shil_opcode&
else if (memOpSize == MemSize::F32)
shil_param_to_host_reg(op.rs2, xmm0);
else {
#ifdef EXPLODE_SPANS
if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1))
if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2))
{
mov(xmm0, regalloc.MapXRegister(op.rs2, 0));
mov(xmm1, regalloc.MapXRegister(op.rs2, 1));
}
else
#endif
{
movd(xmm0, dword[op.rs2.reg_ptr()]);
movd(xmm1, dword[op.rs2.reg_ptr() + 1]);

View File

@ -19,9 +19,11 @@
#pragma once
#include "hw/sh4/dyna/ssa_regalloc.h"
#define ALLOC_F64 false
class X86Compiler;
struct X86RegAlloc : RegAlloc<Xbyak::Operand::Code, s8>
struct X86RegAlloc : RegAlloc<Xbyak::Operand::Code, s8, ALLOC_F64>
{
X86RegAlloc(X86Compiler *compiler) : compiler(compiler) {}
@ -40,9 +42,9 @@ struct X86RegAlloc : RegAlloc<Xbyak::Operand::Code, s8>
return Xbyak::Reg32(ereg);
}
Xbyak::Xmm MapXRegister(const shil_param& param)
Xbyak::Xmm MapXRegister(const shil_param& param, int index = 0)
{
s8 ereg = mapf(param);
s8 ereg = mapf(param, index);
if (ereg == -1)
die("VRegister not allocated");
return Xbyak::Xmm(ereg);