diff --git a/src/ARM.cpp b/src/ARM.cpp index 1e753019..2f4aa900 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -159,7 +159,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -632,16 +632,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -769,16 +774,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index b36120a1..96dd8577 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e700..0695b853 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b74..2222bc2b 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d344..fd38724c 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87f..37997743 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe5..8f8bd35e 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4fc..2732181d 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index c117a413..a7d78cd5 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -41,6 +41,7 @@ char DSiNANDPath[1024]; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,6 +59,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c9013aaa..1fcd9bbc 100644 --- a/src/Config.h +++ b/src/Config.h @@ -55,6 +55,7 @@ extern char DSiNANDPath[1024]; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0cfbd1a8..7b6a4504 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1211,9 +1211,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else {