integrate changes from ARM64 backend and more

- better handle LDM/STM in reg alloc
- unify Halted and IRQ in anticipation for branch inlining
- literal optimisations can be disabled in gui
- jit blocks follow simple returns
- fix idle loop detection
- break jit blocks on IRQ (fixes saving in Pokemon White)
This commit is contained in:
RSDuck 2019-10-18 13:29:17 +02:00
parent 9cf7780e46
commit 441869a105
11 changed files with 153 additions and 43 deletions

View File

@ -159,7 +159,7 @@ void ARM::DoSavestate(Savestate* file)
file->Var32((u32*)&Cycles);
//file->Var32((u32*)&CyclesToRun);
file->Var32(&Halted);
file->Var32(&StopExecution);
file->VarArray(R, 16*sizeof(u32));
file->Var32(&CPSR);
@ -632,16 +632,21 @@ void ARMv5::ExecuteJIT()
NDS::ARM9Timestamp += Cycles;
Cycles = 0;
if (IRQ) TriggerIRQ();
if (Halted)
if (StopExecution)
{
bool idleLoop = Halted & 0x20;
Halted &= ~0x20;
if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
if (IRQ)
TriggerIRQ();
if (Halted || IdleLoop)
{
NDS::ARM9Timestamp = NDS::ARM9Target;
bool idleLoop = IdleLoop;
IdleLoop = 0;
if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
{
NDS::ARM9Timestamp = NDS::ARM9Target;
}
break;
}
break;
}
}
@ -769,16 +774,21 @@ void ARMv4::ExecuteJIT()
Cycles = 0;
// TODO optimize this shit!!!
if (IRQ) TriggerIRQ();
if (Halted)
if (StopExecution)
{
bool idleLoop = Halted & 0x20;
Halted &= ~0x20;
if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
if (IRQ)
TriggerIRQ();
if (Halted || IdleLoop)
{
NDS::ARM7Timestamp = NDS::ARM7Target;
bool idleLoop = IdleLoop;
IdleLoop = 0;
if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
{
NDS::ARM7Timestamp = NDS::ARM7Target;
}
break;
}
break;
}
}

View File

@ -112,9 +112,16 @@ public:
u32 Num;
s32 Cycles;
u32 Halted;
u32 IRQ; // nonzero to trigger IRQ
union
{
struct
{
u8 Halted;
u8 IRQ; // nonzero to trigger IRQ
u8 IdleLoop;
};
u32 StopExecution;
};
u32 CodeRegion;
s32 CodeCycles;

View File

@ -16,11 +16,13 @@
#include "GPU3D.h"
#include "SPU.h"
#include "Wifi.h"
#include "NDSCart.h"
namespace ARMJIT
{
#define JIT_DEBUGPRINT(msg, ...)
//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__)
Compiler* compiler;
@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
}
}
bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr)
bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link,
u32& linkAddr, u32& targetAddr)
{
if (thumb)
{
u32 r15 = instr.Addr + 4;
cond = 0xE;
link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG;
linkAddr = instr.Addr + 4;
if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12)))
{
targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9);
@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA
targetAddr = r15 + offset;
return true;
}
else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14)
{
JIT_DEBUGPRINT("returning!\n");
targetAddr = lr;
return true;
}
}
else
{
link = instr.Info.Kind == ARMInstrInfo::ak_BL;
linkAddr = instr.Addr + 4;
cond = instr.Cond();
if (instr.Info.Kind == ARMInstrInfo::ak_BL
|| instr.Info.Kind == ARMInstrInfo::ak_B)
@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA
targetAddr = r15 + offset;
return true;
}
else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14)
{
JIT_DEBUGPRINT("returning!\n");
targetAddr = lr;
return true;
}
}
return false;
}
@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu)
CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated);
u32 lastSegmentStart = blockAddr;
u32 lr;
bool hasLink = false;
do
{
@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu)
cpu->CurInstr = instrs[i].Instr;
cpu->CodeCycles = instrs[i].CodeCycles;
if (instrs[i].Info.DstRegs & (1 << 14))
hasLink = false;
if (thumb)
{
InterpretTHUMB[instrs[i].Info.Kind](cpu);
@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu)
{
bool hasBranched = cpu->R[15] != r15;
u32 cond, target;
bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target);
bool link;
u32 cond, target, linkAddr;
bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target);
JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched);
if (staticBranch)
@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu)
if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart)
{
// we might have an idle loop
u32 offset = (target - blockAddr) / (thumb ? 2 : 4);
if (IsIdleLoop(instrs + offset, i - offset + 1))
u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4);
if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1))
{
instrs[i].BranchFlags |= branch_IdleBranch;
JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr);
}
}
else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize)
else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize)
{
u32 targetPseudoPhysical = cpu->Num == 0
? TranslateAddr<0>(target)
: TranslateAddr<1>(target);
if (link)
{
lr = linkAddr;
hasLink = true;
}
r15 = target + (thumb ? 2 : 4);
assert(r15 == cpu->R[15]);
@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu)
bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken));
if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond)
FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF);
} while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted);
} while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80)));
u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr);
JitBlock* prevBlock = RestoreCandidates[restoreSlot];
@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
{
if ((addr & 0xFF000000) == 0x04000000)
{
if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11))
return (void*)NDSCart::ReadROMData;
/*
unfortunately we can't map GPU2D this way
since it's hidden inside an object

View File

@ -93,10 +93,12 @@ public:
void Prepare(bool thumb, int i)
{
FetchedInstr instr = Instrs[i];
if (LoadedRegs & (1 << 15))
UnloadRegister(15);
BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs);
BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs);
for (int reg : invalidedLiterals)
UnloadLiteral(reg);
@ -108,6 +110,7 @@ public:
{
BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs);
futureNeeded |= regsNeeded.m_val;
regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded);
for (int reg : regsNeeded)
ranking[reg]++;
}
@ -117,8 +120,8 @@ public:
for (int reg : neverNeededAgain)
UnloadRegister(reg);
FetchedInstr Instr = Instrs[i];
u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs;
u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded;
u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded;
BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs);
if (needToBeLoaded != BitSet16(0))
{
@ -143,13 +146,31 @@ public:
loadedSet.m_val = LoadedRegs;
}
// we don't need to load a value which is always going to be overwritten
BitSet16 needValueLoaded(needToBeLoaded);
if (thumb || Instr.Cond() >= 0xE)
needValueLoaded = BitSet16(Instr.Info.SrcRegs);
if (thumb || instr.Cond() >= 0xE)
needValueLoaded = BitSet16(instr.Info.SrcRegs);
for (int reg : needToBeLoaded)
LoadRegister(reg, needValueLoaded[reg]);
}
{
BitSet16 loadedSet(LoadedRegs);
BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs);
if (loadRegs && loadedSet.Count() < NativeRegsAvailable)
{
int left = NativeRegsAvailable - loadedSet.Count();
for (int reg : loadRegs)
{
if (left-- == 0)
break;
writeRegs |= (1 << reg) & instr.Info.DstRegs;
LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs);
}
}
}
DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15);
DirtyRegs |= writeRegs & ~(1 << 15);
}
static const Reg NativeRegAllocOrder[];

View File

@ -364,7 +364,7 @@ void Compiler::Reset()
void Compiler::Comp_SpecialBranchBehaviour()
{
if (CurInstr.BranchFlags & branch_IdleBranch)
OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20));
OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1));
if (CurInstr.BranchFlags & branch_FollowCondNotTaken)
{
@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[]
{
CurInstr = instrs[i];
R15 = CurInstr.Addr + (Thumb ? 4 : 8);
CodeRegion = R15 >> 24;
Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken);
@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI()
Comp_AddCycles_CD();
else
{
IrregularCycles = true;
s32 cycles;
s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD()
IrregularCycles = true;
}
if (!Thumb && CurInstr.Cond() < 0xE)
if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE)
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
else
ConstantCycles += cycles;

View File

@ -1,5 +1,6 @@
#include "ARMJIT_Compiler.h"
#include "../Config.h"
using namespace Gen;
@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz
if (size == 16)
addressMask = ~1;
if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback)))
if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback)))
{
u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
Comp_MemLoadLiteral(size, rd, addr);
@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz
OpArg rdMapped = MapReg(rd);
OpArg rnMapped = MapReg(rn);
if (Thumb && rn == 15)
rnMapped = Imm32(R15 & ~0x2);
bool inlinePreparation = Num == 1;
u32 constLocalROR32 = 4;
@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz
? MemoryFuncs9[size >> 4][!!(flags & memop_Store)]
: MemoryFuncs7[size >> 4][!!((flags & memop_Store))];
if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn))
if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn))
{
u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf()
void Compiler::T_Comp_LoadPCRel()
{
u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
u32 offset = (CurInstr.Instr & 0xFF) << 2;
u32 addr = (R15 & ~0x2) + offset;
if (Config::JIT_LiteralOptimisations)
Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
else
Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0);
}
void Compiler::T_Comp_MemSPRel()

View File

@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr)
if (res.Kind == ARMInstrInfo::tk_LDR_PCREL)
res.SpecialKind = special_LoadLiteral;
if (res.Kind == tk_LDMIA || res.Kind == tk_POP)
{
u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
res.NotStrictlyNeeded |= set;
res.DstRegs |= set;
}
if (res.Kind == tk_STMIA || res.Kind == tk_PUSH)
{
u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
if (res.Kind == tk_PUSH && instr & (1 << 8))
set |= (1 << 14);
res.NotStrictlyNeeded |= set;
res.SrcRegs |= set;
}
res.EndBlock |= res.Branches();
if (res.Kind == tk_BCOND)
@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr)
if ((data & A_LoadMem) && res.SrcRegs == (1 << 15))
res.SpecialKind = special_LoadLiteral;
if (res.Kind == ak_LDM)
{
u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
res.DstRegs |= set;
res.NotStrictlyNeeded |= set;
}
if (res.Kind == ak_STM)
{
u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
res.SrcRegs |= set;
res.NotStrictlyNeeded |= set;
}
if ((instr >> 28) < 0xE)
{

View File

@ -236,7 +236,7 @@ enum
struct Info
{
u16 DstRegs, SrcRegs;
u16 DstRegs, SrcRegs, NotStrictlyNeeded;
u16 Kind;
u8 SpecialKind;

View File

@ -41,6 +41,7 @@ char DSiNANDPath[1024];
bool JIT_Enable = false;
int JIT_MaxBlockSize = 12;
bool JIT_BrancheOptimisations = true;
bool JIT_LiteralOptimisations = true;
#endif
ConfigEntry ConfigFile[] =
@ -58,6 +59,7 @@ ConfigEntry ConfigFile[] =
{"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0},
{"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0},
{"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0},
{"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0},
#endif
{"", -1, NULL, 0, NULL, 0}

View File

@ -55,6 +55,7 @@ extern char DSiNANDPath[1024];
extern bool JIT_Enable;
extern int JIT_MaxBlockSize;
extern bool JIT_BrancheOptimisations;
extern bool JIT_LiteralOptimisations;
#endif
}

View File

@ -1211,9 +1211,9 @@ void UpdateIRQ(u32 cpu)
if (IME[cpu] & 0x1)
{
arm->IRQ = IE[cpu] & IF[cpu];
arm->IRQ = !!(IE[cpu] & IF[cpu]);
if ((ConsoleType == 1) && cpu)
arm->IRQ |= (IE2 & IF2);
arm->IRQ |= !!(IE2 & IF2);
}
else
{