From 2d8bc6d6ee2d34d89f2e63611debb86a5cd0f962 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Sun, 14 Feb 2021 18:49:40 +0100 Subject: [PATCH] dynarec: skip single branch targets --- core/hw/sh4/dyna/decoder.cpp | 118 +++++++++++++++++++++-------------- core/hw/sh4/dyna/decoder.h | 8 ++- core/hw/sh4/dyna/driver.cpp | 4 +- core/hw/sh4/dyna/ssa.cpp | 2 +- core/hw/sh4/dyna/ssa.h | 45 +++++++++++++ 5 files changed, 124 insertions(+), 53 deletions(-) diff --git a/core/hw/sh4/dyna/decoder.cpp b/core/hw/sh4/dyna/decoder.cpp index aa0fb976e..032d4401f 100644 --- a/core/hw/sh4/dyna/decoder.cpp +++ b/core/hw/sh4/dyna/decoder.cpp @@ -13,6 +13,7 @@ #include "hw/sh4/sh4_opcode_list.h" #include "hw/sh4/sh4_core.h" #include "hw/sh4/sh4_mem.h" +#include "hw/sh4/modules/mmu.h" #include "decoder_opcodes.h" #define BLOCK_MAX_SH_OPS_SOFT 500 @@ -101,13 +102,13 @@ static void dec_DynamicSet(u32 regbase,u32 offs=0) Emit(shop_jdyn,reg_pc_dyn,mk_reg((Sh4RegType)regbase),mk_imm(offs)); } -static void dec_End(u32 dst,BlockEndType flags,bool delay) +static void dec_End(u32 dst, BlockEndType flags, bool delaySlot) { if (state.ngen.OnlyDynamicEnds && flags == BET_StaticJump) { - Emit(shop_mov32,mk_reg(reg_nextpc),mk_imm(dst)); + Emit(shop_mov32, mk_reg(reg_nextpc), mk_imm(dst)); dec_DynamicSet(reg_nextpc); - dec_End(0xFFFFFFFF,BET_DynamicJump,delay); + dec_End(NullAddress, BET_DynamicJump, delaySlot); return; } @@ -116,11 +117,14 @@ static void dec_End(u32 dst,BlockEndType flags,bool delay) verify(flags == BET_DynamicJump); } - state.BlockType=flags; - state.NextOp=delay?NDO_Delayslot:NDO_End; - state.DelayOp=NDO_End; - state.JumpAddr=dst; - state.NextAddr=state.cpu.rpc+2+(delay?2:0); + state.BlockType = flags; + state.NextOp = delaySlot ? NDO_Delayslot : NDO_End; + state.DelayOp = NDO_End; + state.JumpAddr = dst; + if (flags != BET_StaticCall && flags != BET_StaticJump) + state.NextAddr = state.cpu.rpc + 2 + (delaySlot ? 2 : 0); + else + verify(state.JumpAddr != NullAddress); } #define GetN(str) ((str>>8) & 0xf) @@ -128,9 +132,6 @@ static void dec_End(u32 dst,BlockEndType flags,bool delay) #define GetImm4(str) ((str>>0) & 0xf) #define GetImm8(str) ((str>>0) & 0xff) #define GetSImm8(str) ((s8)((str>>0) & 0xff)) -#define GetImm12(str) ((str>>0) & 0xfff) -#define GetSImm12(str) (((s16)((GetImm12(str))<<4))>>4) - #define SR_STATUS_MASK 0x700083F2 #define SR_T_MASK 1 @@ -189,7 +190,7 @@ sh4dec(i0000_nnnn_0010_0011) u32 n = GetN(op); dec_DynamicSet(reg_r0+n,state.cpu.rpc + 4); - dec_End(0xFFFFFFFF,BET_DynamicJump,true); + dec_End(NullAddress, BET_DynamicJump, true); } //jmp @ sh4dec(i0100_nnnn_0010_1011) @@ -197,39 +198,36 @@ sh4dec(i0100_nnnn_0010_1011) u32 n = GetN(op); dec_DynamicSet(reg_r0+n); - dec_End(0xFFFFFFFF,BET_DynamicJump,true); + dec_End(NullAddress, BET_DynamicJump, true); } //bsr sh4dec(i1011_iiii_iiii_iiii) { - //TODO: set PR dec_set_pr(); - dec_End(dec_jump_simm12(op),BET_StaticCall,true); + dec_End(dec_jump_simm12(op), BET_StaticCall, true); } //bsrf sh4dec(i0000_nnnn_0000_0011) { u32 n = GetN(op); - //TODO: set PR u32 retaddr=dec_set_pr(); dec_DynamicSet(reg_r0+n,retaddr); - dec_End(0xFFFFFFFF,BET_DynamicCall,true); + dec_End(NullAddress, BET_DynamicCall, true); } //jsr @ sh4dec(i0100_nnnn_0000_1011) { u32 n = GetN(op); - //TODO: Set pr dec_set_pr(); dec_DynamicSet(reg_r0+n); - dec_End(0xFFFFFFFF,BET_DynamicCall,true); + dec_End(NullAddress, BET_DynamicCall, true); } //rts sh4dec(i0000_0000_0000_1011) { dec_DynamicSet(reg_pr); - dec_End(0xFFFFFFFF,BET_DynamicRet,true); + dec_End(NullAddress, BET_DynamicRet, true); } //rte sh4dec(i0000_0000_0010_1011) @@ -238,7 +236,7 @@ sh4dec(i0000_0000_0010_1011) dec_write_sr(reg_ssr); Emit(shop_sync_sr); dec_DynamicSet(reg_spc); - dec_End(0xFFFFFFFF,BET_DynamicIntr,true); + dec_End(NullAddress, BET_DynamicIntr, true); } //trapa # sh4dec(i1100_0011_iiii_iiii) @@ -246,7 +244,7 @@ sh4dec(i1100_0011_iiii_iiii) //TODO: ifb dec_fallback(op); dec_DynamicSet(reg_nextpc); - dec_End(0xFFFFFFFF,BET_DynamicJump,false); + dec_End(NullAddress, BET_DynamicJump, false); } //sleep sh4dec(i0000_0000_0001_1011) @@ -254,7 +252,7 @@ sh4dec(i0000_0000_0001_1011) //TODO: ifb dec_fallback(op); dec_DynamicSet(reg_nextpc); - dec_End(0xFFFFFFFF,BET_DynamicJump,false); + dec_End(NullAddress, BET_DynamicJump, false); } //ldc.l @+,SR @@ -272,7 +270,7 @@ sh4dec(i0100_nnnn_0000_0111) //FIXME only if interrupts got on .. :P UpdateINTC(); } - dec_End(0xFFFFFFFF,BET_StaticIntr,false); + dec_End(NullAddress,BET_StaticIntr,false); } */ @@ -283,7 +281,7 @@ sh4dec(i0100_nnnn_0000_1110) dec_write_sr((Sh4RegType)(reg_r0+n)); Emit(shop_sync_sr); - dec_End(0xFFFFFFFF,BET_StaticIntr,false); + dec_End(NullAddress, BET_StaticIntr, false); } //nop ! @@ -975,16 +973,29 @@ static void state_Setup(u32 rpc,fpscr_t fpu_cfg) //verify(fpu_cfg.RM<2); // Happens with many wince games (set to 3) //what about fp/fs ? - state.NextOp=NDO_NextOp; - state.BlockType=BET_SCL_Intr; - state.JumpAddr=0xFFFFFFFF; - state.NextAddr=0xFFFFFFFF; + state.NextOp = NDO_NextOp; + state.BlockType = BET_SCL_Intr; + state.JumpAddr = NullAddress; + state.NextAddr = NullAddress; state.info.has_readm=false; state.info.has_writem=false; state.info.has_fpu=false; } +void dec_updateBlockCycles(RuntimeBlockInfo *block, u16 op) +{ + if (!mmu_enabled()) + { + if (op < 0xF000) + block->guest_cycles++; + } + else + { + block->guest_cycles += std::max((int)OpDesc[op]->LatencyCycles, 1); + } +} + bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles) { blk=rbi; @@ -1015,15 +1026,8 @@ bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles) u32 op = IReadMem16(state.cpu.rpc); blk->guest_opcodes++; - if (!mmu_enabled()) - { - if (op < 0xF000) - blk->guest_cycles++; - } - else - { - blk->guest_cycles += std::max((int)OpDesc[op]->LatencyCycles, 1); - } + dec_updateBlockCycles(blk, op); + if (OpDesc[op]->IsFloatingPoint()) { if (sr.FD == 1) @@ -1045,11 +1049,11 @@ bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles) if (OpDesc[op]->SetPC()) { dec_DynamicSet(reg_nextpc); - dec_End(0xFFFFFFFF,BET_DynamicJump,false); + dec_End(NullAddress, BET_DynamicJump, false); } - if (OpDesc[op]->SetFPSCR() && !state.cpu.is_delayslot) + else if (OpDesc[op]->SetFPSCR() && !state.cpu.is_delayslot) { - dec_End(state.cpu.rpc+2,BET_StaticJump,false); + dec_End(state.cpu.rpc + 2, BET_StaticJump, false); } } } @@ -1062,13 +1066,33 @@ bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles) } break; - case NDO_Jump: - die("Too old"); - //state.NextOp=state.JumpOp; - //state.cpu.rpc=state.JumpAddr; - break; - case NDO_End: + // Disabled for now since we need to know if the block is read-only, + // which isn't determined until after the decoding. + // This is a relatively rare optimization anyway +#if 0 + // detect if calling an empty subroutine and skip it + if (state.BlockType == BET_StaticCall && blk->read_only) + { + if ((state.JumpAddr >> 12) == (blk->vaddr >> 12) + || (state.JumpAddr >> 12) == ((blk->vaddr + (blk->guest_opcodes - 1) * 2) >> 12)) + { + u32 op = IReadMem16(state.JumpAddr); + if (op == 0x000B) // rts + { + u16 delayOp = IReadMem16(state.JumpAddr + 2); + if (delayOp == 0x0000 || delayOp == 0x0009) // nop + { + state.NextOp = NDO_NextOp; + state.cpu.is_delayslot = false; + dec_updateBlockCycles(blk, op); + dec_updateBlockCycles(blk, delayOp); + continue; + } + } + } + } +#endif goto _end; } } diff --git a/core/hw/sh4/dyna/decoder.h b/core/hw/sh4/dyna/decoder.h index 4fb843f47..5398e14d8 100644 --- a/core/hw/sh4/dyna/decoder.h +++ b/core/hw/sh4/dyna/decoder.h @@ -35,7 +35,6 @@ enum NextDecoderOperation NDO_NextOp, //pc+=2 NDO_End, //End the block, Type = BlockEndType NDO_Delayslot, //pc+=2, NextOp=DelayOp - NDO_Jump, //pc=JumpAddr,NextOp=JumpOp }; //ngen features struct ngen_features @@ -46,12 +45,12 @@ struct ngen_features struct RuntimeBlockInfo; bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles); +void dec_updateBlockCycles(RuntimeBlockInfo *block, u16 op); struct state_t { NextDecoderOperation NextOp; NextDecoderOperation DelayOp; - NextDecoderOperation JumpOp; u32 JumpAddr; u32 NextAddr; BlockEndType BlockType; @@ -73,5 +72,8 @@ struct state_t bool has_writem; bool has_fpu; } info; +}; -} ; +const u32 NullAddress = 0xFFFFFFFF; +#define GetImm12(str) ((str>>0) & 0xfff) +#define GetSImm12(str) (((short)((GetImm12(str))<<4))>>4) diff --git a/core/hw/sh4/dyna/driver.cpp b/core/hw/sh4/dyna/driver.cpp index 31f0cdcbd..e9d290112 100644 --- a/core/hw/sh4/dyna/driver.cpp +++ b/core/hw/sh4/dyna/driver.cpp @@ -149,8 +149,8 @@ bool RuntimeBlockInfo::Setup(u32 rpc,fpscr_t rfpu_cfg) pBranchBlock=pNextBlock=0; code=0; has_jcond=false; - BranchBlock = 0xFFFFFFFF; - NextBlock = 0xFFFFFFFF; + BranchBlock = NullAddress; + NextBlock = NullAddress; BlockType = BET_SCL_Intr; has_fpu_op = false; temp_block = false; diff --git a/core/hw/sh4/dyna/ssa.cpp b/core/hw/sh4/dyna/ssa.cpp index 8baee7160..9288e62ff 100644 --- a/core/hw/sh4/dyna/ssa.cpp +++ b/core/hw/sh4/dyna/ssa.cpp @@ -249,7 +249,7 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) block->BranchBlock = block->NextBlock; } block->BlockType = BET_StaticJump; - block->NextBlock = 0xFFFFFFFF; + block->NextBlock = NullAddress; block->has_jcond = false; // same remark regarding jdyn as in the previous case block->oplist.erase(block->oplist.begin() + opnum); diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index 73daff95c..e27b35e60 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -50,6 +50,7 @@ public: CombineShiftsPass(); DeadRegisterPass(); IdentityMovePass(); + SingleBranchTargetPass(); #if DEBUG if (stats.prop_constants > 0 || stats.dead_code_ops > 0 || stats.constant_ops_replaced > 0 @@ -714,6 +715,50 @@ private: } } + bool skipSingleBranchTarget(u32& addr, bool updateCycles) + { + if (addr == NullAddress) + return false; + bool success = false; + while (true) + { + if ((addr >> 12) != (block->vaddr >> 12) + && (addr >> 12) != ((block->vaddr + (block->guest_opcodes - 1) * 2) >> 12)) + break; + + u32 op = IReadMem16(addr); + // Axxx: bra + if ((op & 0xF000) != 0xA000) + break; + + u16 delayOp = IReadMem16(addr + 2); + if (delayOp != 0x0000 && delayOp != 0x0009) // nop + break; + + int disp = GetSImm12(op) * 2 + 4; + if (disp == 0) + // infiniloop + break; + addr += disp; + if (updateCycles) + { + dec_updateBlockCycles(block, op); + dec_updateBlockCycles(block, delayOp); + } + success = true; + } + return success; + } + + void SingleBranchTargetPass() + { + if (block->read_only) + { + bool updateCycles = !skipSingleBranchTarget(block->BranchBlock, true); + skipSingleBranchTarget(block->NextBlock, updateCycles); + } + } + RuntimeBlockInfo* block; std::set writeback_values;