dynarec: more accurate cycle counting. limit code duplication

arm, arm64, x86: only account for the current block's cycles if the block is
actually executed. Test cycle_counter before updating it.
arm, arm64: use a shared helper for the MMU block check (vaddr comparison and
FPU-disabled check) to reduce host code size per block.
Flyinghead 2023-03-08 19:24:49 +01:00
parent 4eb5cd928e
commit 0b1f69bfd2
3 changed files with 186 additions and 191 deletions
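
The gist of the scheduling change: the per-block prologue used to subtract the block's guest cycles up front and only then check whether the timeslice was exhausted; it now tests cycle_counter first, calls intc_sched if the counter has run out, and charges the block's cycles only once the block is actually going to run. The C++ below is a simplified model of that ordering, not the emitter code; run_scheduler and block_cycles are illustrative names, and the exact comparison (< 0 vs <= 0) differs slightly between backends.

#include <cstdint>

// Stand-in for intc_sched: refills the timeslice and may divert to interrupt handling.
extern int32_t run_scheduler(uint32_t pc, int32_t counter);

// Old order: the block's cycles were subtracted even if the scheduler then diverted
// execution elsewhere, so a block that never ran could still be charged.
int32_t prologue_old(int32_t cycle_counter, uint32_t pc, uint32_t block_cycles)
{
    cycle_counter -= block_cycles;
    if (cycle_counter < 0)
        cycle_counter = run_scheduler(pc, cycle_counter);
    return cycle_counter;
}

// New order: test first; if an interrupt retargets the pc inside run_scheduler,
// control never returns here and the block's cycles are never charged.
int32_t prologue_new(int32_t cycle_counter, uint32_t pc, uint32_t block_cycles)
{
    if (cycle_counter <= 0)
        cycle_counter = run_scheduler(pc, cycle_counter);
    cycle_counter -= block_cycles;
    return cycle_counter;
}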


@ -58,9 +58,8 @@ using namespace vixl::aarch32;
Block linking
Reg alloc
r0~r4: scratch
r5,r6,r7,r10,r11: allocated
r5,r6,r7,r9,r10,r11: allocated
r8: sh4 cntx
r9: cycle counter
fpu reg alloc
d8:d15, single storage
@ -119,7 +118,8 @@ static void storeSh4Reg(Register Rt, u32 Sh4_Reg)
ass.Str(Rt, MemOperand(r8, shRegOffs));
}
const int alloc_regs[] = { 5, 6, 7, 10, 11, -1 };
const int alloc_regs[] = { 5, 6, 7, 9, 10, 11, -1 };
const int alloc_regs_mmu[] = { 5, 6, 7, 10, 11, -1 };
const int alloc_fpu[] = { 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, -1 };
@ -173,6 +173,8 @@ static const void *ngen_LinkBlock_cond_Next_stub;
static void (*ngen_FailedToFindBlock_)();
static void (*mainloop)(void *);
static void (*handleException)();
static void (*checkBlockFpu)();
static void (*checkBlockNoFpu)();
static void generate_mainloop();
@ -1991,7 +1993,7 @@ void ngen_Compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool s
block->code = (DynarecCodeEntryPtr)emit_GetCCPtr();
//reg alloc
reg.DoAlloc(block, alloc_regs, alloc_fpu);
reg.DoAlloc(block, mmu_enabled() ? alloc_regs_mmu : alloc_regs, alloc_fpu);
u8* blk_start = ass.GetCursorAddress<u8 *>();
@ -2000,90 +2002,76 @@ void ngen_Compile(RuntimeBlockInfo* block, bool force_checks, bool reset, bool s
reg.OpBegin(&block->oplist[0], 0);
// block checks
if (force_checks || mmu_enabled())
if (mmu_enabled())
{
ass.Mov(r0, block->vaddr);
ass.Mov(r1, block->addr);
if (block->has_fpu_op)
call((void *)checkBlockFpu);
else
call((void *)checkBlockNoFpu);
}
if (force_checks)
{
u32 addr = block->addr;
ass.Mov(r0, addr);
if (mmu_enabled())
{
loadSh4Reg(r2, reg_nextpc);
ass.Mov(r1, block->vaddr);
ass.Cmp(r2, r1);
jump(ngen_blockcheckfail, ne);
}
if (force_checks)
s32 sz = block->sh4_code_size;
while (sz > 0)
{
s32 sz = block->sh4_code_size;
while (sz > 0)
if (sz > 2)
{
if (sz > 2)
u32* ptr = (u32*)GetMemPtr(addr, 4);
if (ptr != nullptr)
{
u32* ptr = (u32*)GetMemPtr(addr, 4);
if (ptr != nullptr)
{
ass.Mov(r2, (u32)ptr);
ass.Ldr(r2, MemOperand(r2));
ass.Mov(r1, *ptr);
ass.Cmp(r1, r2);
ass.Mov(r2, (u32)ptr);
ass.Ldr(r2, MemOperand(r2));
ass.Mov(r1, *ptr);
ass.Cmp(r1, r2);
jump(ngen_blockcheckfail, ne);
}
addr += 4;
sz -= 4;
}
else
{
u16* ptr = (u16 *)GetMemPtr(addr, 2);
if (ptr != nullptr)
{
ass.Mov(r2, (u32)ptr);
ass.Ldrh(r2, MemOperand(r2));
ass.Mov(r1, *ptr);
ass.Cmp(r1, r2);
jump(ngen_blockcheckfail, ne);
}
addr += 2;
sz -= 2;
jump(ngen_blockcheckfail, ne);
}
addr += 4;
sz -= 4;
}
}
if (mmu_enabled() && block->has_fpu_op)
{
Label fpu_enabled;
loadSh4Reg(r1, reg_sr_status);
ass.Tst(r1, 1 << 15); // test SR.FD bit
ass.B(eq, &fpu_enabled);
else
{
u16* ptr = (u16 *)GetMemPtr(addr, 2);
if (ptr != nullptr)
{
ass.Mov(r2, (u32)ptr);
ass.Ldrh(r2, MemOperand(r2));
ass.Mov(r1, *ptr);
ass.Cmp(r1, r2);
ass.Mov(r0, block->vaddr); // pc
ass.Mov(r1, Sh4Ex_FpuDisabled);// exception code
call((void *)Do_Exception);
loadSh4Reg(r4, reg_nextpc);
jump(no_update);
ass.Bind(&fpu_enabled);
jump(ngen_blockcheckfail, ne);
}
addr += 2;
sz -= 2;
}
}
}
//scheduler
u32 cyc = block->guest_cycles;
if (!ImmediateA32::IsImmediateA32(cyc))
cyc &= ~3;
if (!mmu_enabled())
ass.Ldr(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
ass.Cmp(r1, 0);
Label cyclesRemaining;
ass.B(pl, &cyclesRemaining);
ass.Mov(r0, block->vaddr);
call(intc_sched);
ass.Mov(r1, r0);
ass.Bind(&cyclesRemaining);
const u32 cycles = block->guest_cycles;
if (!ImmediateA32::IsImmediateA32(cycles))
{
ass.Sub(SetFlags, r9, r9, cyc);
ass.Sub(r1, r1, cycles & ~3);
ass.Sub(r1, r1, cycles & 3);
}
else
{
ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
ass.Sub(SetFlags, r0, r0, cyc);
ass.Str(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
// FIXME condition?
ass.Mov(r4, block->vaddr);
storeSh4Reg(r4, reg_nextpc);
ass.Sub(r1, r1, cycles);
}
call(intc_sched, le);
ass.Str(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
//compile the block's opcodes
shil_opcode* op;
@ -2229,8 +2217,6 @@ static void generate_mainloop()
{
// r8: context
ass.Mov(r8, r0);
// r9: cycle counter
ass.Ldr(r9, MemOperand(r0, rcbOffset(cntx.cycle_counter)));
}
else
{
@ -2253,29 +2239,24 @@ static void generate_mainloop()
// this code is here for fall-through behavior of do_iter
Label do_iter;
Label cleanup;
// intc_sched:
// intc_sched: r0 is pc, r1 is cycle_counter
intc_sched = ass.GetCursorAddress<const void *>();
if (!mmu_enabled())
ass.Add(r9, r9, SH4_TIMESLICE);
else
{
ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
ass.Add(r0, r0, SH4_TIMESLICE);
ass.Str(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
}
ass.Mov(r4, lr);
call((void *)UpdateSystem);
ass.Mov(lr, r4);
ass.Cmp(r0, 0);
ass.B(ne, &do_iter);
ass.Add(r1, r1, SH4_TIMESLICE);
ass.Str(r1, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
ass.Str(r0, MemOperand(r8, rcbOffset(cntx.pc)));
ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.CpuRunning)));
ass.Cmp(r0, 0);
ass.Bx(ne, lr);
// do_iter:
ass.B(eq, &cleanup);
ass.Mov(r4, lr);
call((void *)UpdateSystem_INTC);
ass.Cmp(r0, 0);
ass.B(ne, &do_iter);
ass.Mov(lr, r4);
ass.Ldr(r0, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
ass.Bx(lr);
// do_iter:
ass.Bind(&do_iter);
ass.Mov(r0, r4);
call((void *)rdv_DoInterrupts);
ass.Mov(r4, r0);
ass.Ldr(r4, MemOperand(r8, rcbOffset(cntx.pc)));
// no_update:
no_update = ass.GetCursorAddress<const void *>();
@ -2303,8 +2284,6 @@ static void generate_mainloop()
ass.Bind(&cleanup);
if (mmu_enabled())
ass.Add(sp, sp, 8); // pop context & alignment
else
ass.Str(r9, MemOperand(r8, rcbOffset(cntx.cycle_counter)));
{
UseScratchRegisterScope scope(&ass);
scope.ExcludeAll();
@ -2322,6 +2301,29 @@ static void generate_mainloop()
ass.B(&longjumpLabel);
}
// MMU Check block (with fpu)
// r0: vaddr, r1: addr
checkBlockFpu = ass.GetCursorAddress<void (*)()>();
Label fpu_enabled;
loadSh4Reg(r2, reg_sr_status);
ass.Tst(r2, 1 << 15); // test SR.FD bit
ass.B(eq, &fpu_enabled);
ass.Mov(r1, Sh4Ex_FpuDisabled); // exception code
call((void *)Do_Exception);
loadSh4Reg(r4, reg_nextpc);
ass.B(&no_updateLabel);
ass.Bind(&fpu_enabled);
// fallthrough
// MMU Check block (no fpu)
// r0: vaddr, r1: addr
checkBlockNoFpu = ass.GetCursorAddress<void (*)()>();
loadSh4Reg(r2, reg_nextpc);
ass.Cmp(r2, r0);
ass.Mov(r0, r1);
jump(ngen_blockcheckfail, ne);
ass.Bx(lr);
// Memory handlers
for (int s=0;s<6;s++)
{
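
A note on the shared MMU block-check stubs emitted above (the arm64 backend gains equivalent ones below): with the MMU enabled, each block now just loads vaddr/addr into r0/r1 and calls checkBlockFpu or checkBlockNoFpu, instead of inlining the nextpc comparison and the SR.FD test in every block. A loose C++ rendering of what the stubs verify, with a simplified context type rather than the actual declarations:

#include <cstdint>

struct Ctx { uint32_t pc; uint32_t sr; };        // assumed minimal SH4 context fields

// Returns true when the block may run. The emitted stub never "returns false":
// it jumps to ngen_blockcheckfail(addr) or leaves through no_update instead.
bool check_block_no_fpu(const Ctx &ctx, uint32_t vaddr, uint32_t addr)
{
    (void)addr;            // addr is what ngen_blockcheckfail would receive
    // The block was compiled for one virtual address; if the current pc no longer
    // matches it, the block must be discarded and recompiled.
    return ctx.pc == vaddr;
}

bool check_block_fpu(const Ctx &ctx, uint32_t vaddr, uint32_t addr)
{
    if (ctx.sr & (1u << 15))                     // SR.FD set: FPU disabled
        return false;                            // emitted code raises Sh4Ex_FpuDisabled
    return check_block_no_fpu(ctx, vaddr, addr); // then falls through to the vaddr check
}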


@ -62,6 +62,8 @@ struct DynaCode;
static DynaCode *arm64_intc_sched;
static DynaCode *arm64_no_update;
static DynaCode *blockCheckFail;
static DynaCode *checkBlockNoFpu;
static DynaCode *checkBlockFpu;
static DynaCode *linkBlockGenericStub;
static DynaCode *linkBlockBranchStub;
static DynaCode *linkBlockNextStub;
@ -294,27 +296,18 @@ public:
regalloc.DoAlloc(block);
// scheduler
if (mmu_enabled())
{
Ldr(w0, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
Subs(w0, w0, block->guest_cycles);
Str(w0, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
}
else
{
Subs(w27, w27, block->guest_cycles);
}
Ldr(w1, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
Cmp(w1, 0);
Label cycles_remaining;
B(&cycles_remaining, pl);
Mov(w0, block->vaddr);
GenCall(arm64_intc_sched);
Label cpu_running;
Cbnz(w0, &cpu_running);
Mov(w29, block->vaddr);
Str(w29, sh4_context_mem_operand(&next_pc));
GenBranch(arm64_no_update);
Bind(&cpu_running);
Mov(w1, w0);
Bind(&cycles_remaining);
Sub(w1, w1, block->guest_cycles);
Str(w1, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
for (size_t i = 0; i < block->oplist.size(); i++)
{
shil_opcode& op = block->oplist[i];
@ -1359,7 +1352,7 @@ public:
Label intc_sched;
Label end_mainloop;
// int intc_sched()
// int intc_sched(int pc, int cycle_counter)
arm64_intc_sched = GetCursorAddress<DynaCode *>();
verify((void *)arm64_intc_sched == (void *)CodeCache);
B(&intc_sched);
@ -1461,8 +1454,6 @@ public:
{
// Use x28 as sh4 context pointer
Mov(x28, x0);
// Use x27 as cycle_counter
Ldr(w27, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
}
Label do_interrupts;
@ -1470,40 +1461,29 @@ public:
Ldr(w29, MemOperand(x28, offsetof(Sh4Context, pc)));
B(&no_update);
Bind(&intc_sched);
Bind(&intc_sched); // w0 is pc, w1 is cycle_counter
Str(w0, sh4_context_mem_operand(&Sh4cntx.pc));
// Add timeslice to cycle counter
if (!mmu_enabled())
{
Add(w27, w27, SH4_TIMESLICE);
}
else
{
Ldr(w0, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
Add(w0, w0, SH4_TIMESLICE);
Str(w0, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
}
Mov(x29, lr); // Trashing pc here but it will be reset at the end of the block or in DoInterrupts
GenCallRuntime(UpdateSystem);
Mov(lr, x29);
Add(w1, w1, SH4_TIMESLICE);
Str(w1, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
Ldr(w0, sh4_context_mem_operand(&Sh4cntx.CpuRunning));
Cbz(w0, &end_mainloop);
Mov(x29, lr); // Save link register in case we return
GenCallRuntime(UpdateSystem_INTC);
Cbnz(w0, &do_interrupts);
Ldr(w0, MemOperand(x28, offsetof(Sh4Context, CpuRunning)));
Mov(lr, x29);
Ldr(w0, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
Ret();
Bind(&do_interrupts);
Mov(x0, x29);
GenCallRuntime(rdv_DoInterrupts); // Updates next_pc based on host pc
Mov(w29, w0);
Ldr(w29, sh4_context_mem_operand(&Sh4cntx.pc));
B(&no_update);
Bind(&end_mainloop);
if (mmu_enabled())
// Pop context
Add(sp, sp, 16);
else
// save cycle counter
Str(w27, sh4_context_mem_operand(&Sh4cntx.cycle_counter));
// Restore registers
Ldp(x29, x30, MemOperand(sp, 144));
Ldp(d12, d13, MemOperand(sp, 128));
@ -1528,8 +1508,33 @@ public:
B(&reenterLabel);
}
// MMU Block check (with fpu)
// w0: vaddr, w1: addr
checkBlockFpu = GetCursorAddress<DynaCode *>();
Label fpu_enabled;
Ldr(w10, sh4_context_mem_operand(&sr));
Tbz(w10, 15, &fpu_enabled); // test SR.FD bit
Mov(w1, Sh4Ex_FpuDisabled); // exception code
GenCallRuntime(Do_Exception);
Ldr(w29, sh4_context_mem_operand(&next_pc));
B(&no_update);
Bind(&fpu_enabled);
// fallthrough
Label blockCheckFailLabel;
// MMU Block check (no fpu)
// w0: vaddr, w1: addr
checkBlockNoFpu = GetCursorAddress<DynaCode *>();
Ldr(w2, sh4_context_mem_operand(&Sh4cntx.pc));
Cmp(w2, w0);
Mov(w0, w1);
B(&blockCheckFailLabel, ne);
Ret();
// Block check fail
blockCheckFail = GetCursorAddress<DynaCode *>();
// w0: addr
Bind(&blockCheckFailLabel);
GenCallRuntime(rdv_BlockCheckFail);
if (mmu_enabled())
{
@ -1588,6 +1593,7 @@ public:
arm64_no_update = GetLabelAddress<DynaCode *>(&no_update);
handleException = (void (*)())CC_RW2RX(GetLabelAddress<uintptr_t>(&handleExceptionLabel));
blockCheckFail = GetLabelAddress<DynaCode *>(&blockCheckFailLabel);
writeStoreQueue32 = GetLabelAddress<DynaCode *>(&writeStoreQueue32Label);
writeStoreQueue64 = GetLabelAddress<DynaCode *>(&writeStoreQueue64Label);
@ -2037,56 +2043,56 @@ private:
void CheckBlock(bool force_checks, RuntimeBlockInfo* block)
{
if (!mmu_enabled() && !force_checks)
if (mmu_enabled())
{
Mov(w0, block->vaddr);
Mov(w1, block->addr);
if (block->has_fpu_op)
GenCall(checkBlockFpu);
else
GenCall(checkBlockNoFpu);
}
if (!force_checks)
return;
Label blockcheck_fail;
s32 sz = block->sh4_code_size;
u8* ptr = GetMemPtr(block->addr, sz);
if (ptr != NULL)
{
Ldr(x9, reinterpret_cast<uintptr_t>(ptr));
if (mmu_enabled())
{
Ldr(w10, sh4_context_mem_operand(&next_pc));
Ldr(w11, block->vaddr);
Cmp(w10, w11);
B(ne, &blockcheck_fail);
}
if (force_checks)
{
s32 sz = block->sh4_code_size;
u8* ptr = GetMemPtr(block->addr, sz);
if (ptr != NULL)
while (sz > 0)
{
Ldr(x9, reinterpret_cast<uintptr_t>(ptr));
while (sz > 0)
if (sz >= 8)
{
if (sz >= 8)
{
Ldr(x10, MemOperand(x9, 8, PostIndex));
Ldr(x11, *(u64*)ptr);
Cmp(x10, x11);
sz -= 8;
ptr += 8;
}
else if (sz >= 4)
{
Ldr(w10, MemOperand(x9, 4, PostIndex));
Ldr(w11, *(u32*)ptr);
Cmp(w10, w11);
sz -= 4;
ptr += 4;
}
else
{
Ldrh(w10, MemOperand(x9, 2, PostIndex));
Mov(w11, *(u16*)ptr);
Cmp(w10, w11);
sz -= 2;
ptr += 2;
}
B(ne, &blockcheck_fail);
Ldr(x10, MemOperand(x9, 8, PostIndex));
Ldr(x11, *(u64*)ptr);
Cmp(x10, x11);
sz -= 8;
ptr += 8;
}
else if (sz >= 4)
{
Ldr(w10, MemOperand(x9, 4, PostIndex));
Ldr(w11, *(u32*)ptr);
Cmp(w10, w11);
sz -= 4;
ptr += 4;
}
else
{
Ldrh(w10, MemOperand(x9, 2, PostIndex));
Mov(w11, *(u16*)ptr);
Cmp(w10, w11);
sz -= 2;
ptr += 2;
}
B(ne, &blockcheck_fail);
}
}
Label blockcheck_success;
B(&blockcheck_success);
Bind(&blockcheck_fail);
@ -2094,21 +2100,6 @@ private:
GenBranch(blockCheckFail);
Bind(&blockcheck_success);
if (mmu_enabled() && block->has_fpu_op)
{
Label fpu_enabled;
Ldr(w10, sh4_context_mem_operand(&sr));
Tbz(w10, 15, &fpu_enabled); // test SR.FD bit
Mov(w0, block->vaddr); // pc
Mov(w1, Sh4Ex_FpuDisabled);// exception code
CallRuntime(Do_Exception);
Ldr(w29, sh4_context_mem_operand(&next_pc));
GenBranch(arm64_no_update);
Bind(&fpu_enabled);
}
}
void shil_param_to_host_reg(const shil_param& param, const Register& reg)
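
For completeness, intc_sched now takes the block's pc and the current cycle counter as arguments (r0/r1 on arm32, w0/w1 on arm64) and hands back the refreshed counter. The sketch below is a loose C++ paraphrase of that contract; the context fields are simplified and the enum stands in for control flow that, in the emitted code, leaves through cleanup or no_update rather than returning:

#include <cstdint>

struct Cntx { int32_t cycle_counter; uint32_t pc; uint32_t CpuRunning; };

extern const int32_t SH4_TIMESLICE;   // timeslice length, defined by the core
extern int UpdateSystem_INTC();       // runs the scheduler; nonzero if an interrupt
                                      // retargeted the guest pc

enum class SchedResult { RunBlock, EnterDispatcher, LeaveMainloop };

SchedResult intc_sched_model(Cntx &ctx, uint32_t pc, int32_t cycle_counter)
{
    ctx.cycle_counter = cycle_counter + SH4_TIMESLICE; // refill and publish the counter
    ctx.pc = pc;                                       // where the block would resume

    if (!ctx.CpuRunning)
        return SchedResult::LeaveMainloop;             // cleanup path in the emitted code

    if (UpdateSystem_INTC() != 0)
        return SchedResult::EnterDispatcher;           // handle the interrupt, then re-enter
                                                       // the dispatcher at the context's pc;
                                                       // the caller's cycles are never charged

    return SchedResult::RunBlock;                      // hand the refreshed counter back and
                                                       // let the block charge its own cycles
}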


@ -126,11 +126,13 @@ void X86Compiler::compile(RuntimeBlockInfo* block, bool force_checks, bool optim
L(fpu_enabled);
}
sub(dword[&Sh4cntx.cycle_counter], block->guest_cycles);
mov(eax, dword[&Sh4cntx.cycle_counter]);
test(eax, eax);
Xbyak::Label no_up;
jns(no_up);
jg(no_up);
call((const void *)intc_sched);
L(no_up);
sub(dword[&Sh4cntx.cycle_counter], block->guest_cycles);
regalloc.doAlloc(block);