new block cache and much more...

- more reliable code invalidation detection
- blocks aren't stopped at any branch, but are followed
 if possible to get larger blocks
- idle loop recognition
- optimised literal loads, load/store cycle counting
 and loads/stores from constant addresses
RSDuck 2019-10-03 01:10:59 +02:00
parent 0e26aa4ede
commit 40b88ab05a
18 changed files with 1536 additions and 695 deletions


@ -623,21 +623,26 @@ void ARMv5::ExecuteJIT()
return;
}
ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr);
Cycles += (block ? block : ARMJIT::CompileBlock(this))();
ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr);
if (block)
Cycles += block();
else
ARMJIT::CompileBlock(this);
NDS::ARM9Timestamp += Cycles;
Cycles = 0;
if (IRQ) TriggerIRQ();
if (Halted)
{
if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target)
bool idleLoop = Halted & 0x20;
Halted &= ~0x20;
if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
{
NDS::ARM9Timestamp = NDS::ARM9Target;
}
break;
}
if (IRQ) TriggerIRQ();
NDS::ARM9Timestamp += Cycles;
Cycles = 0;
}
if (Halted == 2)
@ -753,23 +758,28 @@ void ARMv4::ExecuteJIT()
printf("ARMv4 PC in non executable region %08X\n", R[15]);
return;
}
ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr);
Cycles += (block ? block : ARMJIT::CompileBlock(this))();
ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr);
if (block)
Cycles += block();
else
ARMJIT::CompileBlock(this);
NDS::ARM7Timestamp += Cycles;
Cycles = 0;
// TODO optimize this shit!!!
if (IRQ) TriggerIRQ();
if (Halted)
{
if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target)
bool idleLoop = Halted & 0x20;
Halted &= ~0x20;
if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
{
NDS::ARM7Timestamp = NDS::ARM7Target;
}
break;
}
if (IRQ) TriggerIRQ();
NDS::ARM7Timestamp += Cycles;
Cycles = 0;
}
if (Halted == 2)
@ -779,6 +789,8 @@ void ARMv4::ExecuteJIT()
void ARMv5::FillPipeline()
{
SetupCodeMem(R[15]);
if (CPSR & 0x20)
{
if ((R[15] - 2) & 0x2)
@ -801,6 +813,8 @@ void ARMv5::FillPipeline()
void ARMv4::FillPipeline()
{
SetupCodeMem(R[15]);
if (CPSR & 0x20)
{
NextInstr[0] = CodeRead16(R[15] - 2);


@ -311,7 +311,7 @@ public:
{
*val = BusRead8(addr);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][0];
DataCycles = NDS::ARM7MemTimings[addr >> 15][0];
}
void DataRead16(u32 addr, u32* val)
@ -320,7 +320,7 @@ public:
*val = BusRead16(addr);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][0];
DataCycles = NDS::ARM7MemTimings[addr >> 15][0];
}
void DataRead32(u32 addr, u32* val)
@ -329,7 +329,7 @@ public:
*val = BusRead32(addr);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][2];
DataCycles = NDS::ARM7MemTimings[addr >> 15][2];
}
void DataRead32S(u32 addr, u32* val)
@ -337,14 +337,14 @@ public:
addr &= ~3;
*val = BusRead32(addr);
DataCycles += NDS::ARM7MemTimings[DataRegion][3];
DataCycles += NDS::ARM7MemTimings[addr >> 15][3];
}
void DataWrite8(u32 addr, u8 val)
{
BusWrite8(addr, val);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][0];
DataCycles = NDS::ARM7MemTimings[addr >> 15][0];
}
void DataWrite16(u32 addr, u16 val)
@ -353,7 +353,7 @@ public:
BusWrite16(addr, val);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][0];
DataCycles = NDS::ARM7MemTimings[addr >> 15][0];
}
void DataWrite32(u32 addr, u32 val)
@ -362,7 +362,7 @@ public:
BusWrite32(addr, val);
DataRegion = addr >> 24;
DataCycles = NDS::ARM7MemTimings[DataRegion][2];
DataCycles = NDS::ARM7MemTimings[addr >> 15][2];
}
void DataWrite32S(u32 addr, u32 val)
@ -370,7 +370,7 @@ public:
addr &= ~3;
BusWrite32(addr, val);
DataCycles += NDS::ARM7MemTimings[DataRegion][3];
DataCycles += NDS::ARM7MemTimings[addr >> 15][3];
}
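These hunks fix the ARM7 data-timing lookup: NDS::ARM7MemTimings appears to be keyed per 32 KB page (addr >> 15), but the old code indexed it with DataRegion (addr >> 24), reading the wrong row for most addresses. A minimal standalone sketch of the two indexing schemes, assuming a per-page table layout:

#include <cstdint>
#include <cstdio>

// stand-in for NDS::ARM7MemTimings: one row per 32 KB page (assumed layout,
// columns = nonseq/seq 16-bit, nonseq/seq 32-bit)
static uint8_t ARM7MemTimings[0x20000][4];

int main()
{
    uint32_t addr = 0x03800000;   // ARM7-private WRAM
    uint32_t oldRow = addr >> 24; // old: one row per 16 MB region byte
    uint32_t newRow = addr >> 15; // new: one row per 32 KB page
    printf("old row %u, new row %u\n", oldRow, newRow);
    return 0;
}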


@ -28,6 +28,15 @@ namespace ARMInterpreter
extern void (*ARMInstrTable[4096])(ARM* cpu);
extern void (*THUMBInstrTable[1024])(ARM* cpu);
void A_MSR_IMM(ARM* cpu);
void A_MSR_REG(ARM* cpu);
void A_MRS(ARM* cpu);
void A_MCR(ARM* cpu);
void A_MRC(ARM* cpu);
void A_SVC(ARM* cpu);
void T_SVC(ARM* cpu);
void A_BLX_IMM(ARM* cpu); // I'm a special one look at me
}


@ -1,122 +1,137 @@
#include "ARMJIT.h"
#include <string.h>
#include <assert.h>
#include "Config.h"
#include "ARMJIT_Internal.h"
#include "ARMJIT_x64/ARMJIT_Compiler.h"
#include "ARMInterpreter_ALU.h"
#include "ARMInterpreter_LoadStore.h"
#include "ARMInterpreter_Branch.h"
#include "ARMInterpreter.h"
#include "GPU3D.h"
#include "SPU.h"
#include "Wifi.h"
namespace ARMJIT
{
#define JIT_DEBUGPRINT(msg, ...)
Compiler* compiler;
BlockCache cache;
const u32 ExeMemRegionSizes[] = {
0x8000, // Unmapped Region (dummy)
0x8000, // ITCM
4*1024*1024, // Main RAM
0x8000, // SWRAM
0xA4000, // LCDC
0x8000, // ARM9 BIOS
0x4000, // ARM7 BIOS
0x10000, // ARM7 WRAM
0x40000 // ARM7 WVRAM
};
const u32 ExeMemRegionOffsets[] = {
0,
0x8000,
0x10000,
0x410000,
0x418000,
0x4BC000,
0x4C4000,
0x4C8000,
0x4D8000,
0x518000,
};
#define DUP2(x) x, x
static ptrdiff_t JIT_MEM[2][32] = {
const static ExeMemKind JIT_MEM[2][32] = {
//arm9
{
/* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)),
/* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror
/* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
/* 3X*/ DUP2(offsetof(BlockCache, SWRAM)),
/* 4X*/ DUP2(-1),
/* 5X*/ DUP2(-1),
/* 6X*/ -1,
offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
/* 7X*/ DUP2(-1),
/* 8X*/ DUP2(-1),
/* 9X*/ DUP2(-1),
/* AX*/ DUP2(-1),
/* BX*/ DUP2(-1),
/* CX*/ DUP2(-1),
/* DX*/ DUP2(-1),
/* EX*/ DUP2(-1),
/* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS))
/* 0X*/ DUP2(exeMem_ITCM),
/* 1X*/ DUP2(exeMem_ITCM), // mirror
/* 2X*/ DUP2(exeMem_MainRAM),
/* 3X*/ DUP2(exeMem_SWRAM),
/* 4X*/ DUP2(exeMem_Unmapped),
/* 5X*/ DUP2(exeMem_Unmapped),
/* 6X*/ exeMem_Unmapped,
exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
/* 7X*/ DUP2(exeMem_Unmapped),
/* 8X*/ DUP2(exeMem_Unmapped),
/* 9X*/ DUP2(exeMem_Unmapped),
/* AX*/ DUP2(exeMem_Unmapped),
/* BX*/ DUP2(exeMem_Unmapped),
/* CX*/ DUP2(exeMem_Unmapped),
/* DX*/ DUP2(exeMem_Unmapped),
/* EX*/ DUP2(exeMem_Unmapped),
/* FX*/ DUP2(exeMem_ARM9_BIOS)
},
//arm7
{
/* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)),
/* 1X*/ DUP2(-1),
/* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
/* 3X*/ offsetof(BlockCache, SWRAM),
offsetof(BlockCache, ARM7_WRAM),
/* 4X*/ DUP2(-1),
/* 5X*/ DUP2(-1),
/* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself,
/* 0X*/ DUP2(exeMem_ARM7_BIOS),
/* 1X*/ DUP2(exeMem_Unmapped),
/* 2X*/ DUP2(exeMem_MainRAM),
/* 3X*/ exeMem_SWRAM,
exeMem_ARM7_WRAM,
/* 4X*/ DUP2(exeMem_Unmapped),
/* 5X*/ DUP2(exeMem_Unmapped),
/* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself,
DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
/* 7X*/ DUP2(-1),
/* 8X*/ DUP2(-1),
/* 9X*/ DUP2(-1),
/* AX*/ DUP2(-1),
/* BX*/ DUP2(-1),
/* CX*/ DUP2(-1),
/* DX*/ DUP2(-1),
/* EX*/ DUP2(-1),
/* FX*/ DUP2(-1)
}
};
static u32 JIT_MASK[2][32] = {
//arm9
{
/* 0X*/ DUP2(0x00007FFF),
/* 1X*/ DUP2(0x00007FFF),
/* 2X*/ DUP2(0x003FFFFF),
/* 3X*/ DUP2(0x00007FFF),
/* 4X*/ DUP2(0x00000000),
/* 5X*/ DUP2(0x00000000),
/* 6X*/ 0x00000000,
0x000FFFFF,
/* 7X*/ DUP2(0x00000000),
/* 8X*/ DUP2(0x00000000),
/* 9X*/ DUP2(0x00000000),
/* AX*/ DUP2(0x00000000),
/* BX*/ DUP2(0x00000000),
/* CX*/ DUP2(0x00000000),
/* DX*/ DUP2(0x00000000),
/* EX*/ DUP2(0x00000000),
/* FX*/ DUP2(0x00007FFF)
},
//arm7
{
/* 0X*/ DUP2(0x00003FFF),
/* 1X*/ DUP2(0x00000000),
/* 2X*/ DUP2(0x003FFFFF),
/* 3X*/ 0x00007FFF,
0x0000FFFF,
/* 4X*/ 0x00000000,
0x0000FFFF,
/* 5X*/ DUP2(0x00000000),
/* 6X*/ DUP2(0x0003FFFF),
/* 7X*/ DUP2(0x00000000),
/* 8X*/ DUP2(0x00000000),
/* 9X*/ DUP2(0x00000000),
/* AX*/ DUP2(0x00000000),
/* BX*/ DUP2(0x00000000),
/* CX*/ DUP2(0x00000000),
/* DX*/ DUP2(0x00000000),
/* EX*/ DUP2(0x00000000),
/* FX*/ DUP2(0x00000000)
/* 7X*/ DUP2(exeMem_Unmapped),
/* 8X*/ DUP2(exeMem_Unmapped),
/* 9X*/ DUP2(exeMem_Unmapped),
/* AX*/ DUP2(exeMem_Unmapped),
/* BX*/ DUP2(exeMem_Unmapped),
/* CX*/ DUP2(exeMem_Unmapped),
/* DX*/ DUP2(exeMem_Unmapped),
/* EX*/ DUP2(exeMem_Unmapped),
/* FX*/ DUP2(exeMem_Unmapped)
}
};
#undef DUP2
/*
translates address to pseudo physical address
- more compact, eliminates mirroring, everything comes in a row
- we only need one translation table
*/
u32 AddrTranslate9[0x2000];
u32 AddrTranslate7[0x4000];
JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
AddressRange CodeRanges[ExeMemSpaceSize / 256];
TinyVector<JitBlock*> JitBlocks;
JitBlock* RestoreCandidates[0x1000] = {NULL};
u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
{
return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
}
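Two things worth noting here: the translation tables fold every mirror of a region onto a single pseudo physical address, and HashRestoreCandidate is Fibonacci hashing (multiply by 2^64 divided by the golden ratio, keep the top bits). A small self-contained sketch of both, using the main RAM offset and size from the tables above:

#include <cstdint>
#include <cstdio>

const uint32_t MainRAMOffset = 0x10000;      // ExeMemRegionOffsets[exeMem_MainRAM]
const uint32_t MainRAMSize   = 4*1024*1024;  // ExeMemRegionSizes[exeMem_MainRAM]

// what AddrTranslate9/7 effectively store for a main RAM page:
// the region offset plus the address folded by the region size
uint32_t TranslateMainRAM(uint32_t addr)
{
    return MainRAMOffset + (addr & (MainRAMSize - 1));
}

uint32_t HashRestoreCandidate(uint32_t pseudoPhysicalAddr)
{
    // 11400714819323198485 = 2^64 / golden ratio; >> 53 keeps the top 11 bits
    return (uint32_t)(((uint64_t)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
}

int main()
{
    // 0x02000000 and its 4 MB mirror map to the same pseudo physical address
    printf("%x %x\n", TranslateMainRAM(0x02000000), TranslateMainRAM(0x02400000));
    printf("slot %u\n", HashRestoreCandidate(TranslateMainRAM(0x02000000)));
    return 0;
}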
void Init()
{
memset(&cache, 0, sizeof(BlockCache));
for (int i = 0; i < 0x2000; i++)
cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL :
(CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8])
+ (((i << 15) & JIT_MASK[0][i >> 8]) >> 1);
{
ExeMemKind kind = JIT_MEM[0][i >> 8];
u32 size = ExeMemRegionSizes[kind];
AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1));
}
for (int i = 0; i < 0x4000; i++)
cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL :
(CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9])
+ (((i << 14) & JIT_MASK[1][i >> 9]) >> 1);
{
ExeMemKind kind = JIT_MEM[1][i >> 9];
u32 size = ExeMemRegionSizes[kind];
AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1));
}
compiler = new Compiler();
}
@ -126,7 +141,7 @@ void DeInit()
delete compiler;
}
void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
{
for (int j = start; j >= 0; j--)
{
@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
}
}
CompiledBlock CompileBlock(ARM* cpu)
bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr)
{
if (thumb)
{
u32 r15 = instr.Addr + 4;
cond = 0xE;
if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12)))
{
targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9);
targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1;
return true;
}
else if (instr.Info.Kind == ARMInstrInfo::tk_B)
{
s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20;
targetAddr = r15 + offset;
return true;
}
else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND)
{
cond = (instr.Instr >> 8) & 0xF;
s32 offset = (s32)(instr.Instr << 24) >> 23;
targetAddr = r15 + offset;
return true;
}
}
else
{
cond = instr.Cond();
if (instr.Info.Kind == ARMInstrInfo::ak_BL
|| instr.Info.Kind == ARMInstrInfo::ak_B)
{
s32 offset = (s32)(instr.Instr << 8) >> 6;
u32 r15 = instr.Addr + 8;
targetAddr = r15 + offset;
return true;
}
}
return false;
}
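The shift pairs in DecodeBranch are sign-extension idioms: shift the immediate field up to the top of a 32-bit word, then arithmetic-shift back down, scaled to halfwords or words. A quick standalone check of the decoding (the encodings are the canonical branch-to-self instructions, not taken from any particular game):

#include <cstdint>
#include <cstdio>

// Thumb B (tk_B): offset = sign_extend(imm11) << 1
int32_t DecodeThumbB(uint16_t instr)
{
    return ((int32_t)(((uint32_t)instr & 0x7FF) << 21)) >> 20;
}

// ARM B/BL: offset = sign_extend(imm24) << 2
int32_t DecodeArmB(uint32_t instr)
{
    return ((int32_t)(instr << 8)) >> 6;
}

int main()
{
    printf("%d\n", DecodeThumbB(0xE7FE));   // "b ." -> -4, target = (addr+4)-4 = addr
    printf("%d\n", DecodeArmB(0xEAFFFFFE)); // "b ." -> -8, target = (addr+8)-8 = addr
    return 0;
}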
bool IsIdleLoop(FetchedInstr* instrs, int instrsCount)
{
// see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678
// it basically checks if one iteration of a loop depends on another
// the rules are quite simple
u16 regsWrittenTo = 0;
u16 regsDisallowedToWrite = 0;
for (int i = 0; i < instrsCount; i++)
{
//printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite);
if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem)
return false;
if (i < instrsCount - 1 && instrs[i].Info.Branches())
return false;
u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15);
u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15);
regsDisallowedToWrite |= srcRegs & ~regsWrittenTo;
if (dstRegs & regsDisallowedToWrite)
return false;
regsWrittenTo |= dstRegs;
}
return true;
}
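To make the rules concrete: an iteration that only re-reads memory or IO and branches back depends on nothing a previous iteration wrote, so the emulator may fast-forward time while waiting in it. A simplified, self-contained re-run of the same bookkeeping (the r15 masking and the mid-block branch check from the real function are elided):

#include <cstdint>
#include <cstdio>

// minimal stand-in for the per-instruction info used above
struct MiniInstr { uint16_t SrcRegs, DstRegs; bool WritesMem; };

static bool IsIdleLoop(const MiniInstr* instrs, int count)
{
    uint16_t written = 0, disallowed = 0;
    for (int i = 0; i < count; i++)
    {
        if (instrs[i].WritesMem) return false;
        disallowed |= instrs[i].SrcRegs & ~written;       // read before being written this iteration
        if (instrs[i].DstRegs & disallowed) return false; // would carry state into the next iteration
        written |= instrs[i].DstRegs;
    }
    return true;
}

int main()
{
    // ldr r0, [r1] / cmp r0, #0 / beq loop -- classic wait-on-flag loop
    MiniInstr idle[] = { {1<<1, 1<<0, false}, {1<<0, 0, false}, {0, 0, false} };
    // add r0, r0, #1 / b loop -- r0 carries state across iterations
    MiniInstr notIdle[] = { {1<<0, 1<<0, false}, {0, 0, false} };
    printf("%d %d\n", IsIdleLoop(idle, 3), IsIdleLoop(notIdle, 2)); // 1 0
    return 0;
}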
typedef void (*InterpreterFunc)(ARM* cpu);
#define F(x) &ARMInterpreter::A_##x
#define F_ALU(name, s) \
F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \
F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s)
#define F_MEM_WB(name) \
F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \
F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM)
#define F_MEM_HD(name) \
F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM)
InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] =
{
F_ALU(AND,), F_ALU(AND,_S),
F_ALU(EOR,), F_ALU(EOR,_S),
F_ALU(SUB,), F_ALU(SUB,_S),
F_ALU(RSB,), F_ALU(RSB,_S),
F_ALU(ADD,), F_ALU(ADD,_S),
F_ALU(ADC,), F_ALU(ADC,_S),
F_ALU(SBC,), F_ALU(SBC,_S),
F_ALU(RSC,), F_ALU(RSC,_S),
F_ALU(ORR,), F_ALU(ORR,_S),
F_ALU(MOV,), F_ALU(MOV,_S),
F_ALU(BIC,), F_ALU(BIC,_S),
F_ALU(MVN,), F_ALU(MVN,_S),
F_ALU(TST,),
F_ALU(TEQ,),
F_ALU(CMP,),
F_ALU(CMN,),
F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy),
F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB),
F_MEM_WB(STR),
F_MEM_WB(STRB),
F_MEM_WB(LDR),
F_MEM_WB(LDRB),
F_MEM_HD(STRH),
F_MEM_HD(LDRD),
F_MEM_HD(STRD),
F_MEM_HD(LDRH),
F_MEM_HD(LDRSB),
F_MEM_HD(LDRSH),
F(SWP), F(SWPB),
F(LDM), F(STM),
F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG),
F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC)
};
#undef F_ALU
#undef F_MEM_WB
#undef F_MEM_HD
#undef F
#define F(x) ARMInterpreter::T_##x
InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] =
{
F(LSL_IMM), F(LSR_IMM), F(ASR_IMM),
F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_),
F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM),
F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG),
F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG),
F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG),
F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG),
F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP),
F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG),
F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM),
F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL),
F(PUSH), F(POP), F(LDMIA), F(STMIA),
F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2),
F(UNK), F(SVC),
NULL // BL_LONG pseudo opcode
};
#undef F
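For readability: the F_* macros stamp out interpreter entries in the same order as the ak_*/tk_* enums. F_ALU(AND,) in the ARM table above, for instance, expands to:

&ARMInterpreter::A_AND_REG_LSL_IMM, &ARMInterpreter::A_AND_REG_LSR_IMM,
&ARMInterpreter::A_AND_REG_ASR_IMM, &ARMInterpreter::A_AND_REG_ROR_IMM,
&ARMInterpreter::A_AND_REG_LSL_REG, &ARMInterpreter::A_AND_REG_LSR_REG,
&ARMInterpreter::A_AND_REG_ASR_REG, &ARMInterpreter::A_AND_REG_ROR_REG,
&ARMInterpreter::A_AND_IMM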
void CompileBlock(ARM* cpu)
{
bool thumb = cpu->CPSR & 0x20;
@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu)
if (Config::JIT_MaxBlockSize > 32)
Config::JIT_MaxBlockSize = 32;
u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
if (!(cpu->Num == 0
? IsMapped<0>(blockAddr)
: IsMapped<1>(blockAddr)))
{
printf("Trying to compile a block in unmapped memory: %x\n", blockAddr);
}
u32 pseudoPhysicalAddr = cpu->Num == 0
? TranslateAddr<0>(blockAddr)
: TranslateAddr<1>(blockAddr);
FetchedInstr instrs[Config::JIT_MaxBlockSize];
int i = 0;
u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
u32 r15 = cpu->R[15];
u32 addresseRanges[32] = {};
u32 numAddressRanges = 0;
cpu->FillPipeline();
u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
u32 nextInstrAddr[2] = {blockAddr, r15};
JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n",
blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2],
cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr),
CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated);
u32 lastSegmentStart = blockAddr;
do
{
r15 += thumb ? 2 : 4;
instrs[i].BranchFlags = 0;
instrs[i].SetFlags = 0;
instrs[i].Instr = nextInstr[0];
instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1];
@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu)
instrs[i].Addr = nextInstrAddr[0];
nextInstrAddr[0] = nextInstrAddr[1];
nextInstrAddr[1] = r15;
JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr);
u32 translatedAddr = (cpu->Num == 0
? TranslateAddr<0>(instrs[i].Addr)
: TranslateAddr<1>(instrs[i].Addr)) & ~0xFF;
if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1])
{
bool returning = false;
for (int j = 0; j < numAddressRanges; j++)
{
if (addresseRanges[j] == translatedAddr)
{
returning = true;
break;
}
}
if (!returning)
addresseRanges[numAddressRanges++] = translatedAddr;
}
if (cpu->Num == 0)
{
@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu)
instrs[i].NextInstr[1] = nextInstr[1];
instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr);
cpu->R[15] = r15;
cpu->CurInstr = instrs[i].Instr;
cpu->CodeCycles = instrs[i].CodeCycles;
if (thumb)
{
InterpretTHUMB[instrs[i].Info.Kind](cpu);
}
else
{
if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM)
{
ARMInterpreter::A_BLX_IMM(cpu);
}
else
{
u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0);
assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM);
if (cpu->CheckCondition(instrs[i].Cond()))
InterpretARM[instrs[i].Info.Kind](cpu);
else
cpu->AddCycles_C();
}
}
instrs[i].DataCycles = cpu->DataCycles;
instrs[i].DataRegion = cpu->DataRegion;
if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0
&& instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1)
{
@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu)
instrs[i - 1].Info.EndBlock = true;
i--;
}
if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations)
{
bool hasBranched = cpu->R[15] != r15;
u32 cond, target;
bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target);
JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched);
if (staticBranch)
{
bool isBackJump = false;
if (hasBranched)
{
for (int j = 0; j < i; j++)
{
if (instrs[j].Addr == target)
{
isBackJump = true;
break;
}
}
}
if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart)
{
// we might have an idle loop
u32 offset = (target - blockAddr) / (thumb ? 2 : 4);
if (IsIdleLoop(instrs + offset, i - offset + 1))
{
instrs[i].BranchFlags |= branch_IdleBranch;
JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr);
}
}
else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize)
{
u32 targetPseudoPhysical = cpu->Num == 0
? TranslateAddr<0>(target)
: TranslateAddr<1>(target);
r15 = target + (thumb ? 2 : 4);
assert(r15 == cpu->R[15]);
JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target);
nextInstr[0] = cpu->NextInstr[0];
nextInstr[1] = cpu->NextInstr[1];
nextInstrAddr[0] = target;
nextInstrAddr[1] = r15;
lastSegmentStart = target;
instrs[i].Info.EndBlock = false;
if (cond < 0xE)
instrs[i].BranchFlags |= branch_FollowCondTaken;
}
}
if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize)
{
instrs[i].Info.EndBlock = false;
instrs[i].BranchFlags |= branch_FollowCondNotTaken;
}
}
i++;
bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind);
if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile)
floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF);
} while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize);
bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken));
if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond)
FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF);
} while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted);
floodFillSetFlags(instrs, i - 1, 0xF);
u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr);
JitBlock* prevBlock = RestoreCandidates[restoreSlot];
bool mayRestore = true;
if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr)
{
RestoreCandidates[restoreSlot] = NULL;
if (prevBlock->NumInstrs == i)
{
for (int j = 0; j < i; j++)
{
if (prevBlock->Instrs()[j] != instrs[j].Instr)
{
mayRestore = false;
break;
}
}
}
else
mayRestore = false;
CompiledBlock block = compiler->CompileBlock(cpu, instrs, i);
if (cpu->Num == 0)
InsertBlock<0>(blockAddr, block);
if (prevBlock->NumAddresses == numAddressRanges)
{
for (int j = 0; j < numAddressRanges; j++)
{
if (prevBlock->AddressRanges()[j] != addresseRanges[j])
{
mayRestore = false;
break;
}
}
}
else
mayRestore = false;
}
else
InsertBlock<1>(blockAddr, block);
{
mayRestore = false;
prevBlock = NULL;
}
return block;
JitBlock* block;
if (!mayRestore)
{
if (prevBlock)
delete prevBlock;
block = new JitBlock(i, numAddressRanges);
for (int j = 0; j < i; j++)
block->Instrs()[j] = instrs[j].Instr;
for (int j = 0; j < numAddressRanges; j++)
block->AddressRanges()[j] = addresseRanges[j];
block->StartAddr = blockAddr;
block->PseudoPhysicalAddr = pseudoPhysicalAddr;
FloodFillSetFlags(instrs, i - 1, 0xF);
block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i);
}
else
{
JIT_DEBUGPRINT("restored! %p\n", prevBlock);
block = prevBlock;
}
for (int j = 0; j < numAddressRanges; j++)
{
assert(addresseRanges[j] == block->AddressRanges()[j]);
CodeRanges[addresseRanges[j] / 256].Blocks.Add(block);
}
FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint;
JitBlocks.Add(block);
}
void InvalidateBlockCache()
void InvalidateByAddr(u32 pseudoPhysical)
{
JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical);
AddressRange* range = &CodeRanges[pseudoPhysical / 256];
int startLength = range->Blocks.Length;
for (int i = 0; i < range->Blocks.Length; i++)
{
assert(range->Blocks.Length == startLength);
JitBlock* block = range->Blocks[i];
for (int j = 0; j < block->NumAddresses; j++)
{
u32 addr = block->AddressRanges()[j];
if ((addr / 256) != (pseudoPhysical / 256))
{
AddressRange* otherRange = &CodeRanges[addr / 256];
assert(otherRange != range);
bool removed = otherRange->Blocks.RemoveByValue(block);
assert(removed);
}
}
bool removedFromList = JitBlocks.RemoveByValue(block);
assert(removedFromList);
FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL;
u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr);
if (RestoreCandidates[slot] && RestoreCandidates[slot] != block)
delete RestoreCandidates[slot];
RestoreCandidates[slot] = block;
}
if (range->TimesInvalidated < UINT16_MAX)
range->TimesInvalidated++;
range->Blocks.Clear();
}
void InvalidateByAddr7(u32 addr)
{
u32 pseudoPhysical = TranslateAddr<1>(addr);
if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false))
InvalidateByAddr(pseudoPhysical);
}
void InvalidateITCM(u32 addr)
{
u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM];
if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0)
InvalidateByAddr(pseudoPhysical);
}
void InvalidateAll()
{
JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length);
for (int i = 0; i < JitBlocks.Length; i++)
{
JitBlock* block = JitBlocks[i];
FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL;
for (int j = 0; j < block->NumAddresses; j++)
{
u32 addr = block->AddressRanges()[j];
AddressRange* range = &CodeRanges[addr / 256];
range->Blocks.Clear();
if (range->TimesInvalidated < UINT16_MAX)
range->TimesInvalidated++;
}
u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr);
if (RestoreCandidates[slot] && RestoreCandidates[slot] != block)
delete RestoreCandidates[slot];
RestoreCandidates[slot] = block;
}
JitBlocks.Clear();
}
void ResetBlockCache()
{
printf("Resetting JIT block cache...\n");
memset(cache.MainRAM, 0, sizeof(cache.MainRAM));
memset(cache.SWRAM, 0, sizeof(cache.SWRAM));
memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS));
memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM));
memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC));
memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS));
memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM));
memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM));
memset(FastBlockAccess, 0, sizeof(FastBlockAccess));
for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++)
{
if (RestoreCandidates[i])
{
delete RestoreCandidates[i];
RestoreCandidates[i] = NULL;
}
}
for (int i = 0; i < JitBlocks.Length; i++)
{
JitBlock* block = JitBlocks[i];
for (int j = 0; j < block->NumAddresses; j++)
{
u32 addr = block->AddressRanges()[j];
CodeRanges[addr / 256].Blocks.Clear();
CodeRanges[addr / 256].TimesInvalidated = 0;
}
delete block;
}
JitBlocks.Clear();
compiler->Reset();
}
void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
{
if (cpu->Num == 0)
{
if ((addr & 0xFF000000) == 0x04000000)
{
/*
unfortunately we can't map GPU2D this way
since it's hidden inside an object
though GPU3D registers are accessed much more intensively
*/
if (addr >= 0x04000320 && addr < 0x040006A4)
{
switch (size | store)
{
case 8: return (void*)GPU3D::Read8;
case 9: return (void*)GPU3D::Write8;
case 16: return (void*)GPU3D::Read16;
case 17: return (void*)GPU3D::Write16;
case 32: return (void*)GPU3D::Read32;
case 33: return (void*)GPU3D::Write32;
}
}
switch (size | store)
{
case 8: return (void*)NDS::ARM9IORead8;
case 9: return (void*)NDS::ARM9IOWrite8;
case 16: return (void*)NDS::ARM9IORead16;
case 17: return (void*)NDS::ARM9IOWrite16;
case 32: return (void*)NDS::ARM9IORead32;
case 33: return (void*)NDS::ARM9IOWrite32;
}
}
}
else
{
switch (addr & 0xFF800000)
{
case 0x04000000:
if (addr >= 0x04000400 && addr < 0x04000520)
{
switch (size | store)
{
case 8: return (void*)SPU::Read8;
case 9: return (void*)SPU::Write8;
case 16: return (void*)SPU::Read16;
case 17: return (void*)SPU::Write16;
case 32: return (void*)SPU::Read32;
case 33: return (void*)SPU::Write32;
}
}
switch (size | store)
{
case 8: return (void*)NDS::ARM7IORead8;
case 9: return (void*)NDS::ARM7IOWrite8;
case 16: return (void*)NDS::ARM7IORead16;
case 17: return (void*)NDS::ARM7IOWrite16;
case 32: return (void*)NDS::ARM7IORead32;
case 33: return (void*)NDS::ARM7IOWrite32;
}
break;
case 0x04800000:
if (addr < 0x04810000 && size == 16)
{
if (store)
return (void*)Wifi::Write;
else
return (void*)Wifi::Read;
}
break;
}
}
return NULL;
}
}


@ -9,142 +9,67 @@
namespace ARMJIT
{
typedef u32 (*CompiledBlock)();
struct FetchedInstr
enum ExeMemKind
{
u32 A_Reg(int pos) const
{
return (Instr >> pos) & 0xF;
}
u32 T_Reg(int pos) const
{
return (Instr >> pos) & 0x7;
}
u32 Cond() const
{
return Instr >> 28;
}
u8 SetFlags;
u32 Instr;
u32 NextInstr[2];
u32 Addr;
u8 CodeCycles;
ARMInstrInfo::Info Info;
exeMem_Unmapped = 0,
exeMem_ITCM,
exeMem_MainRAM,
exeMem_SWRAM,
exeMem_LCDC,
exeMem_ARM9_BIOS,
exeMem_ARM7_BIOS,
exeMem_ARM7_WRAM,
exeMem_ARM7_WVRAM,
exeMem_Count
};
/*
Copied from DeSmuME
Some names were changed to match the nomenclature of melonDS
extern const u32 ExeMemRegionOffsets[];
extern const u32 ExeMemRegionSizes[];
Since it's explained nowhere and at least I needed some time to get behind it,
here's a summary of how it works:
more or less all memory locations from which code can be executed are
represented by an array of function pointers, which point to null or
a function which executes a block of instructions starting from there.
typedef u32 (*JitBlockEntry)();
The most significant 4 bits of each address are ignored. This 28 bit space is
divided into 0x2000 32 KB chunks for ARM9 and 0x4000 16 KB chunks for ARM7,
each of which holds a pointer to the relevant place inside the aforementioned
arrays. 32 and 16 KB are the sizes of the smallest contiguous memory regions
mapped to the respective CPU. Because ARM addresses are always aligned to
4 bytes and Thumb to a 2 byte boundary, we only need every second half word
to be addressable.
extern u32 AddrTranslate9[0x2000];
extern u32 AddrTranslate7[0x4000];
In case a memory write hits mapped memory, the function block at this
address is set to null, so it's recompiled the next time it's executed.
This method has disadvantages, namely that only writing to the
first instruction of a block marks it as invalid and that memory remapping
(SWRAM and VRAM) isn't taken into account.
*/
struct BlockCache
{
CompiledBlock* AddrMapping9[0x2000] = {0};
CompiledBlock* AddrMapping7[0x4000] = {0};
CompiledBlock MainRAM[4*1024*1024/2];
CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
CompiledBlock ARM9_ITCM[0x8000/2];
CompiledBlock ARM9_LCDC[0xA4000/2];
CompiledBlock ARM9_BIOS[0x8000/2];
CompiledBlock ARM7_BIOS[0x4000/2];
CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
};
extern BlockCache cache;
const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
template <u32 num>
inline bool IsMapped(u32 addr)
{
if (num == 0)
return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped];
else
return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped];
}
template <u32 num>
inline CompiledBlock LookUpBlock(u32 addr)
inline u32 TranslateAddr(u32 addr)
{
if (num == 0)
return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
else
return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
}
template <u32 num>
inline void Invalidate16(u32 addr)
inline JitBlockEntry LookUpBlock(u32 addr)
{
if (IsMapped<num>(addr))
{
if (num == 0)
cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
else
cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
}
}
template <u32 num>
inline void Invalidate32(u32 addr)
{
if (IsMapped<num>(addr))
{
if (num == 0)
{
CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
page[(addr & 0x7FFF) >> 1] = NULL;
page[((addr + 2) & 0x7FFF) >> 1] = NULL;
}
else
{
CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
page[(addr & 0x3FFF) >> 1] = NULL;
page[((addr + 2) & 0x3FFF) >> 1] = NULL;
}
}
}
template <u32 num>
inline void InsertBlock(u32 addr, CompiledBlock func)
{
if (num == 0)
cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
else
cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
return FastBlockAccess[TranslateAddr<num>(addr) / 2];
}
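Put together, a block lookup under the new scheme is two dependent array reads. A compact, self-contained restatement of what LookUpBlock<0> computes (array sizes as declared above):

#include <cstdint>

typedef uint32_t (*JitBlockEntry)();

static uint32_t AddrTranslate9[0x2000];
static JitBlockEntry FastBlockAccess[0x518000 / 2];

// translate the 32 KB page, add the in-page offset, then index the flat
// entry-point array at halfword granularity; NULL means "not compiled yet"
JitBlockEntry LookUp9(uint32_t addr)
{
    uint32_t pseudoPhysical = AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
    return FastBlockAccess[pseudoPhysical / 2];
}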
void Init();
void DeInit();
CompiledBlock CompileBlock(ARM* cpu);
void InvalidateByAddr(u32 pseudoPhysical);
void InvalidateAll();
void InvalidateBlockCache();
void InvalidateITCM(u32 addr);
void InvalidateByAddr7(u32 addr);
void CompileBlock(ARM* cpu);
void ResetBlockCache();
}

src/ARMJIT_Internal.h (new file, 198 lines)

@ -0,0 +1,198 @@
#ifndef ARMJIT_INTERNAL_H
#define ARMJIT_INTERNAL_H
#include "types.h"
#include <stdint.h>
#include "ARMJIT.h"
// here lands everything which doesn't fit into ARMJIT.h
// where it would be included by pretty much everything
namespace ARMJIT
{
enum
{
branch_IdleBranch = 1 << 0,
branch_FollowCondTaken = 1 << 1,
branch_FollowCondNotTaken = 1 << 2
};
struct FetchedInstr
{
u32 A_Reg(int pos) const
{
return (Instr >> pos) & 0xF;
}
u32 T_Reg(int pos) const
{
return (Instr >> pos) & 0x7;
}
u32 Cond() const
{
return Instr >> 28;
}
u8 BranchFlags;
u8 SetFlags;
u32 Instr;
u32 NextInstr[2];
u32 Addr;
u8 CodeCycles;
u8 DataCycles;
u8 DataRegion;
ARMInstrInfo::Info Info;
};
/*
TinyVector
- because reinventing the wheel is the best!
- meant to be used very often, with not so many elements
max 1 << 16 elements
- doesn't allocate until the first element is inserted
- not STL conformant of course
- probably only works with POD types
- remove operations don't preserve order, but O(1)!
*/
template <typename T>
struct __attribute__((packed)) TinyVector
{
T* Data = NULL;
u16 Capacity = 0;
u32 Length = 0; // make it 32 bit so we don't need movzx
~TinyVector()
{
delete[] Data;
}
void MakeCapacity(u32 capacity)
{
assert(capacity <= UINT16_MAX);
assert(capacity > Capacity);
T* newMem = new T[capacity];
if (Data != NULL)
memcpy(newMem, Data, sizeof(T) * Length);
T* oldData = Data;
Data = newMem;
if (oldData != NULL)
delete[] oldData;
Capacity = capacity;
}
void Clear()
{
Length = 0;
}
void Add(T element)
{
assert(Length + 1 <= UINT16_MAX);
if (Length + 1 > Capacity)
MakeCapacity(((Capacity + 4) * 3) / 2);
Data[Length++] = element;
}
void Remove(int index)
{
assert(index >= 0 && index < Length);
Length--;
Data[index] = Data[Length];
/*for (int i = index; i < Length; i++)
Data[i] = Data[i + 1];*/
}
int Find(T needle)
{
for (int i = 0; i < Length; i++)
{
if (Data[i] == needle)
return i;
}
return -1;
}
bool RemoveByValue(T needle)
{
for (int i = 0; i < Length; i++)
{
if (Data[i] == needle)
{
Remove(i);
return true;
}
}
return false;
}
T& operator[](int index)
{
assert(index >= 0 && index < Length);
return Data[index];
}
};
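A short usage sketch, since the invalidation code leans on these semantics: Remove swaps the last element into the hole, so element order isn't preserved (assuming ARMJIT_Internal.h's own includes suffice; the path matches this commit's layout):

#include <cassert>
#include "ARMJIT_Internal.h"

int main()
{
    ARMJIT::TinyVector<int> v;
    v.Add(1); v.Add(2); v.Add(3);
    v.Remove(0);                  // tail swapped in: contents are now {3, 2}
    assert(v[0] == 3 && v.Length == 2);
    assert(v.RemoveByValue(2));   // finds 2, removes it, returns true
    assert(!v.RemoveByValue(42)); // not present
    return 0;
}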
class JitBlock
{
public:
JitBlock(u32 numInstrs, u32 numAddresses)
{
NumInstrs = numInstrs;
NumAddresses = numAddresses;
Data = new u32[numInstrs + numAddresses];
}
~JitBlock()
{
delete[] Data;
}
u32 StartAddr;
u32 PseudoPhysicalAddr;
u32 NumInstrs;
u32 NumAddresses;
JitBlockEntry EntryPoint;
u32* Instrs()
{ return Data; }
u32* AddressRanges()
{ return Data + NumInstrs; }
private:
/*
0..<NumInstrs - the instructions of the block
NumInstrs..<(NumAddresses + NumInstrs) - pseudo physical addresses where the block is located
(at least one, the pseudo physical address of the block)
*/
u32* Data;
};
// size should be 16 bytes because I'm too lazy to use mul and whatnot
struct __attribute__((packed)) AddressRange
{
TinyVector<JitBlock*> Blocks;
u16 TimesInvalidated;
};
extern AddressRange CodeRanges[ExeMemSpaceSize / 256];
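The 256-byte bucket is the invalidation granularity: a write only becomes expensive when the bucket of its pseudo physical address actually holds compiled blocks. The bucket math, spelled out as a sketch using the constants above:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t ExeMemSpaceSize = 0x518000;
    uint32_t pseudoPhysical = 0x10040;      // some main RAM code address
    uint32_t bucket = pseudoPhysical / 256; // index into CodeRanges
    printf("bucket %u of %u\n", bucket, ExeMemSpaceSize / 256);
    // stores first test CodeRanges[bucket].Blocks.Length != 0 before
    // calling InvalidateByAddr, so plain data writes stay cheap
    return 0;
}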
typedef void (*InterpreterFunc)(ARM* cpu);
extern InterpreterFunc InterpretARM[];
extern InterpreterFunc InterpretTHUMB[];
void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size);
}
#endif


@ -60,15 +60,46 @@ public:
assert("Welp!");
}
void PutLiteral(int reg, u32 val)
{
LiteralsLoaded |= (1 << reg);
LiteralValues[reg] = val;
}
void UnloadLiteral(int reg)
{
LiteralsLoaded &= ~(1 << reg);
}
bool IsLiteral(int reg)
{
return LiteralsLoaded & (1 << reg);
}
void PrepareExit()
{
BitSet16 dirtyRegs(DirtyRegs);
for (int reg : dirtyRegs)
Compiler->SaveReg(reg, Mapping[reg]);
}
void Flush()
{
BitSet16 loadedSet(LoadedRegs);
for (int reg : loadedSet)
UnloadRegister(reg);
LiteralsLoaded = 0;
}
void Prepare(bool thumb, int i)
{
if (LoadedRegs & (1 << 15))
UnloadRegister(15);
BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs);
for (int reg : invalidedLiterals)
UnloadLiteral(reg);
u16 futureNeeded = 0;
int ranking[16];
for (int j = 0; j < 16; j++)
@ -86,7 +117,7 @@ public:
for (int reg : neverNeededAgain)
UnloadRegister(reg);
FetchedInstr Instr = Instrs[i];
FetchedInstr Instr = Instrs[i];
u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs;
BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs);
if (needToBeLoaded != BitSet16(0))
@ -125,6 +156,9 @@ public:
static const int NativeRegsAvailable;
Reg Mapping[16];
u32 LiteralValues[16];
u16 LiteralsLoaded = 0;
u32 NativeRegsUsed = 0;
u16 LoadedRegs = 0;
u16 DirtyRegs = 0;
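The literal-tracking additions in this hunk (PutLiteral/UnloadLiteral/IsLiteral plus the LiteralValues/LiteralsLoaded fields) are what the commit message's "optimised literal loads" bullet refers to: once a register provably holds a compile-time constant (set by an unconditional MOV/MVN with an immediate operand, see the A_Comp_MovOp hunk below), later accesses through that register can be folded to a constant address, the case Comp_MemLoadLiteral near the end of this diff resolves at compile time. A reduced sketch of the bookkeeping:

#include <cstdint>

// reduced mirror of the fields and helpers above
struct LiteralTracker
{
    uint32_t LiteralValues[16];
    uint16_t LiteralsLoaded = 0;

    void PutLiteral(int reg, uint32_t val) { LiteralsLoaded |= 1u << reg; LiteralValues[reg] = val; }
    void UnloadLiteral(int reg)            { LiteralsLoaded &= ~(1u << reg); }
    bool IsLiteral(int reg) const          { return (LiteralsLoaded >> reg) & 1; }
};

// e.g. after "mov r0, #0x04000000" the compiler knows r0 == 0x04000000,
// so "ldr r1, [r0, #0x208]" can be treated as a load from the constant
// address 0x04000208 instead of going through the generic memory routine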


@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp()
MOV(32, rd, op2);
if (((CurInstr.Instr >> 21) & 0xF) == 0xF)
{
NOT(32, rd);
if (op2.IsImm() && CurInstr.Cond() == 0xE)
RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32());
}
else if (op2.IsImm() && CurInstr.Cond() == 0xE)
RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32());
if (S)
{
@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_()
Comp_AddCycles_C();
if (op & 1)
// special case for thumb mov being an alias for add rd, rn, #0
if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0)
{
if (rd != rs)
MOV(32, rd, rs);
}
else if (op & 1)
Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV);
else
Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV);
@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU()
u32 op = (CurInstr.Instr >> 6) & 0xF;
if ((op >= 0x2 && op < 0x4) || op == 0x7)
Comp_AddCycles_CI(1);
Comp_AddCycles_CI(1); // shift by reg
else
Comp_AddCycles_C();


@ -16,9 +16,6 @@ int squeezePointer(T* ptr)
void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
{
// we can simplify constant branches by a lot
// it's not completely safe to assume stuff like which instructions to preload
// we'll see how it works out
IrregularCycles = true;
u32 newPC;
@ -39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
{
ARMv5* cpu9 = (ARMv5*)CurCPU;
u32 oldregion = R15 >> 24;
u32 newregion = addr >> 24;
u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0];
u32 compileTimeCodeCycles = cpu9->RegionCodeCycles;
cpu9->RegionCodeCycles = regionCodeCycles;
MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles));
bool setupRegion = newregion != oldregion;
if (setupRegion)
cpu9->SetupCodeMem(addr);
if (Exit)
MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles));
if (addr & 0x1)
{
@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
cycles += cpu9->CodeCycles;
}
MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem)));
MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask));
cpu9->RegionCodeCycles = compileTimeCodeCycles;
if (setupRegion)
cpu9->SetupCodeMem(R15);
}
else
{
@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
cpu7->CodeRegion = codeRegion;
cpu7->CodeCycles = codeCycles;
MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion));
MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles));
if (Exit)
{
MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion));
MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles));
}
if (addr & 0x1)
{
@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
cpu7->CodeCycles = addr >> 15;
}
MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC));
if (Exit)
MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC));
if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles)
ConstantCycles += cycles;
else
@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND()
s32 offset = (s32)(CurInstr.Instr << 24) >> 23;
Comp_JumpTo(R15 + offset + 1, true);
Comp_SpecialBranchBehaviour();
FixupBranch skipFailed = J();
SetJumpTarget(skipExecute);
if (CurInstr.BranchFlags & branch_FollowCondTaken)
{
RegCache.PrepareExit();
SaveCPSR(false);
MOV(32, R(RAX), Imm32(ConstantCycles));
ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
RET();
}
Comp_AddCycles_C(true);
SetJumpTarget(skipFailed);
SetJumpTarget(skipFailed);
}
void Compiler::T_Comp_B()


@ -72,12 +72,15 @@ Compiler::Compiler()
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 2; j++)
{
MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i);
MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i);
MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i);
}
}
MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8;
MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8;
MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16;
MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16;
MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32;
MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32;
for (int i = 0; i < 2; i++)
for (int j = 0; j < 2; j++)
{
@ -179,12 +182,13 @@ void Compiler::LoadCPSR()
MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR)));
}
void Compiler::SaveCPSR()
void Compiler::SaveCPSR(bool flagClean)
{
if (CPSRDirty)
{
MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR));
CPSRDirty = false;
if (flagClean)
CPSRDirty = false;
}
}
@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg)
// invalidates RSCRATCH and RSCRATCH3
Gen::FixupBranch Compiler::CheckCondition(u32 cond)
{
// hack, ldm/stm can get really big TODO: make this better
bool ldmStm = !Thumb &&
(CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM);
if (cond >= 0x8)
{
static_assert(RSCRATCH3 == ECX, "RSCRATCH3 has to be equal to ECX!");
@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond)
SHL(32, R(RSCRATCH), R(RSCRATCH3));
TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond]));
return J_CC(CC_Z);
return J_CC(CC_Z, ldmStm);
}
else
{
// could have used a LUT, but then where would the fun be?
TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))));
return J_CC(cond & 1 ? CC_NZ : CC_Z);
return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm);
}
}
@ -354,25 +361,34 @@ void Compiler::Reset()
SetCodePtr(ResetStart);
}
CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount)
void Compiler::Comp_SpecialBranchBehaviour()
{
if (CurInstr.BranchFlags & branch_IdleBranch)
OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20));
if (CurInstr.BranchFlags & branch_FollowCondNotTaken)
{
RegCache.PrepareExit();
SaveCPSR(false);
MOV(32, R(RAX), Imm32(ConstantCycles));
ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
RET();
}
}
JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount)
{
if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess...
InvalidateBlockCache();
ResetBlockCache();
ConstantCycles = 0;
Thumb = cpu->CPSR & 0x20;
Thumb = thumb;
Num = cpu->Num;
CodeRegion = cpu->CodeRegion;
CodeRegion = instrs[0].Addr >> 24;
CurCPU = cpu;
CompiledBlock res = (CompiledBlock)GetWritableCodePtr();
if (!(Num == 0
? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4))
: IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4))))
{
printf("Trying to compile a block in unmapped memory\n");
}
JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr();
ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
LoadCPSR();
// TODO: this is ugly as a whole, do better
RegCache = RegisterCache<Compiler, X64Reg>(this, instrs, instrsCount);
for (int i = 0; i < instrsCount; i++)
@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
CurInstr = instrs[i];
R15 = CurInstr.Addr + (Thumb ? 4 : 8);
Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken);
CompileFunc comp = Thumb
? T_Comp[CurInstr.Info.Kind]
: A_Comp[CurInstr.Info.Kind];
bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE;
if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
{
MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15));
MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
if (comp == NULL)
{
MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
SaveCPSR();
}
}
if (comp != NULL)
RegCache.Prepare(Thumb, i);
else
@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
if (Thumb)
{
u32 icode = (CurInstr.Instr >> 6) & 0x3FF;
if (comp == NULL)
{
MOV(64, R(ABI_PARAM1), R(RCPU));
ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]);
ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]);
}
else
(this->*comp)();
@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
}
}
else if (cond == 0xF)
{
Comp_AddCycles_C();
}
else
{
IrregularCycles = false;
@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
if (cond < 0xE)
skipExecute = CheckCondition(cond);
u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0);
if (comp == NULL)
{
MOV(64, R(ABI_PARAM1), R(RCPU));
ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]);
ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]);
}
else
(this->*comp)();
Comp_SpecialBranchBehaviour();
if (CurInstr.Cond() < 0xE)
{
if (IrregularCycles)
if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken))
{
FixupBranch skipFailed = J();
SetJumpTarget(skipExecute);
Comp_AddCycles_C(true);
if (CurInstr.BranchFlags & branch_FollowCondTaken)
{
RegCache.PrepareExit();
SaveCPSR(false);
MOV(32, R(RAX), Imm32(ConstantCycles));
ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
RET();
}
SetJumpTarget(skipFailed);
}
else
@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
RET();
/*FILE* codeout = fopen("codeout", "a");
fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr);
fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout);
fclose(codeout);*/
return res;
}
@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add)
}
}
void Compiler::Comp_AddCycles_CDI()
{
if (Num == 0)
Comp_AddCycles_CD();
else
{
IrregularCycles = true;
s32 cycles;
s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
s32 numD = CurInstr.DataCycles;
if (CurInstr.DataRegion == 0x02) // mainRAM
{
if (CodeRegion == 0x02)
cycles = numC + numD;
else
{
numC++;
cycles = std::max(numC + numD - 3, std::max(numC, numD));
}
}
else if (CodeRegion == 0x02)
{
numD++;
cycles = std::max(numC + numD - 3, std::max(numC, numD));
}
else
{
cycles = numC + numD + 1;
}
printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles);
if (!Thumb && CurInstr.Cond() < 0xE)
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
else
ConstantCycles += cycles;
}
}
void Compiler::Comp_AddCycles_CD()
{
u32 cycles = 0;
if (Num == 0)
{
s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
s32 numD = CurInstr.DataCycles;
//if (DataRegion != CodeRegion)
cycles = std::max(numC + numD - 6, std::max(numC, numD));
IrregularCycles = cycles != numC;
}
else
{
s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
s32 numD = CurInstr.DataCycles;
if (CurInstr.DataRegion == 0x02)
{
if (CodeRegion == 0x02)
cycles += numC + numD;
else
cycles += std::max(numC + numD - 3, std::max(numC, numD));
}
else if (CodeRegion == 0x02)
{
cycles += std::max(numC + numD - 3, std::max(numC, numD));
}
else
{
cycles += numC + numD;
}
IrregularCycles = true;
}
if (!Thumb && CurInstr.Cond() < 0xE)
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
else
ConstantCycles += cycles;
}
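The recurring max(numC + numD - 3, max(numC, numD)) term models partial overlap of code and data accesses when one side is in main RAM: at most 3 cycles of the two accesses are assumed to overlap, and the total can never drop below either access alone. Checking the arithmetic on two cases:

#include <algorithm>
#include <cstdio>

static int Overlapped(int numC, int numD)
{
    return std::max(numC + numD - 3, std::max(numC, numD));
}

int main()
{
    printf("%d\n", Overlapped(4, 2)); // max(3, 4)  = 4: the data access hides entirely
    printf("%d\n", Overlapped(4, 9)); // max(10, 9) = 10: only 3 cycles are hidden
    return 0;
}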
}


@ -4,6 +4,7 @@
#include "../dolphin/x64Emitter.h"
#include "../ARMJIT.h"
#include "../ARMJIT_Internal.h"
#include "../ARMJIT_RegisterCache.h"
namespace ARMJIT
@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX;
const Gen::X64Reg RSCRATCH2 = Gen::EDX;
const Gen::X64Reg RSCRATCH3 = Gen::ECX;
struct ComplexOperand
{
ComplexOperand()
{}
ComplexOperand(u32 imm)
: IsImm(true), Imm(imm)
{}
ComplexOperand(int reg, int op, int amount)
: IsImm(false)
{
Reg.Reg = reg;
Reg.Op = op;
Reg.Amount = amount;
}
bool IsImm;
union
{
struct
{
int Reg, Op, Amount;
} Reg;
u32 Imm;
};
};
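ComplexOperand lets the unified Comp_MemAccess declared below accept either an immediate or a shifted-register offset through a single parameter. Hypothetical call shapes, assuming op uses the usual ARM shift-type encoding (0 = LSL):

// str r0, [r1, #4]!        -- immediate offset with writeback
// Comp_MemAccess(0, 1, ComplexOperand(4), 32, memop_Store | memop_Writeback);

// ldr r0, [r1], r2, lsl #2 -- post-indexed shifted-register offset
// Comp_MemAccess(0, 1, ComplexOperand(2, 0, 2), 32, memop_Post);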
class Compiler : public Gen::XEmitter
{
@ -24,7 +51,7 @@ public:
void Reset();
CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount);
JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount);
void LoadReg(int reg, Gen::X64Reg nativeReg);
void SaveReg(int reg, Gen::X64Reg nativeReg);
@ -39,6 +66,8 @@ public:
void Comp_AddCycles_C(bool forceNonConstant = false);
void Comp_AddCycles_CI(u32 i);
void Comp_AddCycles_CI(Gen::X64Reg i, int add);
void Comp_AddCycles_CDI();
void Comp_AddCycles_CD();
enum
{
@ -92,8 +121,17 @@ public:
void T_Comp_BL_LONG_2();
void T_Comp_BL_Merged();
void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size);
enum
{
memop_Writeback = 1 << 0,
memop_Post = 1 << 1,
memop_SignExtend = 1 << 2,
memop_Store = 1 << 3,
memop_SubtractOffset = 1 << 4
};
void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags);
s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);
void Comp_MemLoadLiteral(int size, int rd, u32 addr);
void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags);
@ -105,8 +143,9 @@ public:
void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed);
void Comp_SpecialBranchBehaviour();
void* Gen_MemoryRoutine9(bool store, int size);
void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size);
void* Gen_MemoryRoutineSeq9(bool store, bool preinc);
void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM);
@ -117,10 +156,9 @@ public:
Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed);
Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed);
Gen::OpArg A_Comp_GetMemWBOffset();
void LoadCPSR();
void SaveCPSR();
void SaveCPSR(bool flagClean = true);
bool FlagsNZRequired()
{ return CurInstr.SetFlags & 0xC; }
@ -139,10 +177,11 @@ public:
u8* ResetStart;
u32 CodeMemSize;
bool Exit;
bool IrregularCycles;
void* MemoryFuncs9[3][2];
void* MemoryFuncs7[3][2][2];
void* MemoryFuncs7[3][2];
void* MemoryFuncsSeq9[2][2];
void* MemoryFuncsSeq7[2][2][2];


@ -27,51 +27,7 @@ int squeezePointer(T* ptr)
/*
address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows)
store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows)
code cycles - ABI_PARAM3
*/
#define CALC_CYCLES_9(numC, numD, scratch) \
LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \
CMP(32, R(numC), R(numD)); \
CMOVcc(32, numD, R(numC), CC_G); \
CMP(32, R(numD), R(scratch)); \
CMOVcc(32, scratch, R(numD), CC_G); \
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch));
#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \
if (codeMainRAM) \
{ \
LEA(32, scratch, MRegSum(numD, numC)); \
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
} \
else \
{ \
if (!store) \
ADD(32, R(numC), Imm8(1)); \
LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \
CMP(32, R(numD), R(numC)); \
CMOVcc(32, numC, R(numD), CC_G); \
CMP(32, R(numC), R(scratch)); \
CMOVcc(32, scratch, R(numC), CC_G); \
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
}
#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \
if (codeMainRAM) \
{ \
if (!store) \
ADD(32, R(numD), Imm8(1)); \
LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \
CMP(32, R(numD), R(numC)); \
CMOVcc(32, numC, R(numD), CC_G); \
CMP(32, R(numC), R(scratch)); \
CMOVcc(32, scratch, R(numC), CC_G); \
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
} \
else \
{ \
LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
}
void* Compiler::Gen_MemoryRoutine9(bool store, int size)
{
u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0));
@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size)
CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize)));
FixupBranch insideITCM = J_CC(CC_B);
// cycle counting!
MOV(32, R(ABI_PARAM4), R(ABI_PARAM1));
SHR(32, R(ABI_PARAM4), Imm8(12));
MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1)));
CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH)
if (store)
{
if (size > 8)
@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size)
}
SetJumpTarget(insideDTCM);
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3));
AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask));
if (store)
MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2));
@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size)
RET();
SetJumpTarget(insideITCM);
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3));
MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX
AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask));
if (store)
{
MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2));
XOR(32, R(RSCRATCH), R(RSCRATCH));
MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH));
if (size == 32)
MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH));
// if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code!
static_assert(sizeof(AddressRange) == 16);
LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM]));
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
SHR(32, R(RSCRATCH), Imm8(8));
SHL(32, R(RSCRATCH), Imm8(4));
CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0));
FixupBranch noCode = J_CC(CC_Z);
JMP((u8*)InvalidateByAddr, true);
SetJumpTarget(noCode);
}
else
{
@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size)
return res;
}
void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size)
{
u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0));
AlignCode4();
void* res = GetWritableCodePtr();
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
SHR(32, R(RSCRATCH), Imm8(15));
MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings)));
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
AND(32, R(RSCRATCH), Imm32(0xFF000000));
CMP(32, R(RSCRATCH), Imm32(0x02000000));
FixupBranch outsideMainRAM = J_CC(CC_NE);
CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH)
MOV(32, R(ABI_PARAM3), R(ABI_PARAM1));
AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask));
if (store)
{
MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2));
XOR(32, R(RSCRATCH), R(RSCRATCH));
MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH));
if (size == 32)
MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH));
}
else
{
MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)));
if (size == 32)
{
if (ABI_PARAM1 != ECX)
MOV(32, R(ECX), R(ABI_PARAM1));
AND(32, R(ECX), Imm8(3));
SHL(32, R(ECX), Imm8(3));
ROR_(32, R(RSCRATCH), R(ECX));
}
}
RET();
SetJumpTarget(outsideMainRAM);
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH)
if (store)
{
if (size > 8)
AND(32, R(ABI_PARAM1), Imm32(addressMask));
switch (size)
{
case 32: JMP((u8*)NDS::ARM7Write32, true); break;
case 16: JMP((u8*)NDS::ARM7Write16, true); break;
case 8: JMP((u8*)NDS::ARM7Write8, true); break;
}
}
else
{
if (size == 32)
{
ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8);
AND(32, R(ABI_PARAM1), Imm32(addressMask));
ABI_CallFunction(NDS::ARM7Read32);
ABI_PopRegistersAndAdjustStack({ECX}, 8);
AND(32, R(ECX), Imm8(3));
SHL(32, R(ECX), Imm8(3));
ROR_(32, R(RSCRATCH), R(ECX));
RET();
}
else if (size == 16)
{
AND(32, R(ABI_PARAM1), Imm32(addressMask));
JMP((u8*)NDS::ARM7Read16, true);
}
else
JMP((u8*)NDS::ARM7Read8, true);
}
return res;
}
#define MEMORY_SEQ_WHILE_COND \
if (!store) \
MOV(32, currentElement, R(EAX));\
@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size)
ABI_PARAM1 address
ABI_PARAM2 address where registers are stored
ABI_PARAM3 how many values to read/write
ABI_PARAM4 code cycles
Dolphin x64CodeEmitter is my favourite assembler
*/
void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
{
const u8* zero = GetCodePtr();
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4));
RET();
void* res = (void*)GetWritableCodePtr();
TEST(32, R(ABI_PARAM3), R(ABI_PARAM3));
J_CC(CC_Z, zero);
PUSH(ABI_PARAM3);
PUSH(ABI_PARAM4); // we need you later
const u8* repeat = GetCodePtr();
if (preinc)
@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
MEMORY_SEQ_WHILE_COND
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
SHR(32, R(RSCRATCH), Imm8(12));
MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings)));
MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings)));
FixupBranch finishIt1 = J();
RET();
SetJumpTarget(insideDTCM);
AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3));
@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)));
MEMORY_SEQ_WHILE_COND
MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time
MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential
FixupBranch finishIt2 = J();
RET();
SetJumpTarget(insideITCM);
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
@ -340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
{
MOV(32, R(ABI_PARAM4), currentElement);
MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4));
XOR(32, R(ABI_PARAM4), R(ABI_PARAM4));
MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4));
MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4));
ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM]));
MOV(32, R(ABI_PARAM4), R(RSCRATCH));
SHR(32, R(RSCRATCH), Imm8(8));
SHL(32, R(RSCRATCH), Imm8(4));
CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0));
FixupBranch noCode = J_CC(CC_Z);
ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
MOV(32, R(ABI_PARAM1), R(ABI_PARAM4));
CALL((u8*)InvalidateByAddr);
ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
SetJumpTarget(noCode);
}
else
MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)));
MEMORY_SEQ_WHILE_COND
MOV(32, R(RSCRATCH), Imm32(1));
MOV(32, R(ABI_PARAM2), Imm32(1));
SetJumpTarget(finishIt1);
SetJumpTarget(finishIt2);
POP(ABI_PARAM4);
POP(ABI_PARAM3);
CMP(32, R(ABI_PARAM3), Imm8(1));
FixupBranch skipSequential = J_CC(CC_E);
SUB(32, R(ABI_PARAM3), Imm8(1));
IMUL(32, RSCRATCH, R(ABI_PARAM3));
ADD(32, R(ABI_PARAM2), R(RSCRATCH));
SetJumpTarget(skipSequential);
CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
RET();
return res;
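The POP/CMP/IMUL tail above implements the usual first-access/sequential split; as plain C, the charged memory cycles are simply (a sketch of the arithmetic, not melonDS API):

    // an n-element transfer costs one non-sequential access plus n-1 sequential ones
    u32 SeqTransferCycles(u32 n, u32 nonSeq, u32 seq)
    {
        return nonSeq + (n - 1) * seq; // for n == 1 this is just nonSeq
    }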
@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM)
{
const u8* zero = GetCodePtr();
ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4));
RET();
void* res = (void*)GetWritableCodePtr();
TEST(32, R(ABI_PARAM3), R(ABI_PARAM3));
J_CC(CC_Z, zero);
PUSH(ABI_PARAM3);
PUSH(ABI_PARAM4); // we need you later
const u8* repeat = GetCodePtr();
if (preinc)
@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM)
ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
MEMORY_SEQ_WHILE_COND
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
SHR(32, R(RSCRATCH), Imm8(15));
MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings)));
MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings)));
POP(ABI_PARAM4);
POP(ABI_PARAM3);
// TODO: optimise this
CMP(32, R(ABI_PARAM3), Imm8(1));
FixupBranch skipSequential = J_CC(CC_E);
SUB(32, R(ABI_PARAM3), Imm8(1));
IMUL(32, RSCRATCH, R(ABI_PARAM3));
ADD(32, R(ABI_PARAM2), R(RSCRATCH));
SetJumpTarget(skipSequential);
MOV(32, R(RSCRATCH), R(ABI_PARAM1));
AND(32, R(RSCRATCH), Imm32(0xFF000000));
CMP(32, R(RSCRATCH), Imm32(0x02000000));
FixupBranch outsideMainRAM = J_CC(CC_NE);
CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
RET();
SetJumpTarget(outsideMainRAM);
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
RET();
return res;
}
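Both sequential routines fetch the ARM7 timings with the same pattern. In C, the lookup emitted above amounts to the following; the [2]/[3] indices match the byte offsets used in the MOVZX above, assuming 4-byte table entries:

    // ARM7 bus timings are tabulated in 32 KB granularity, hence the >> 15
    u32 nonSeq = NDS::ARM7MemTimings[addr >> 15][2]; // 32 bit non-sequential time
    u32 seq    = NDS::ARM7MemTimings[addr >> 15][3]; // 32 bit sequential time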
#undef CALC_CYCLES_9
#undef MEMORY_SEQ_WHILE_COND
void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size)
void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr)
{
IrregularCycles = true;
if (store)
MOV(32, R(ABI_PARAM2), rd);
u32 cycles = Num
? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
: (R15 & 0x2 ? 0 : CurInstr.CodeCycles);
MOV(32, R(ABI_PARAM3), Imm32(cycles));
CALL(Num == 0
? MemoryFuncs9[size >> 4][store]
: MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]);
if (!store)
u32 val;
// make sure the arm7 bios is accessible (BIOS reads are gated on R15)
u32 tmpR15 = CurCPU->R[15];
CurCPU->R[15] = R15;
if (size == 32)
{
if (signExtend)
MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH));
CurCPU->DataRead32(addr & ~0x3, &val);
val = ROR(val, (addr & 0x3) << 3);
}
else if (size == 16)
CurCPU->DataRead16(addr & ~0x1, &val);
else
CurCPU->DataRead8(addr, &val);
CurCPU->R[15] = tmpR15;
MOV(32, MapReg(rd), Imm32(val));
if (Thumb || CurInstr.Cond() == 0xE)
RegCache.PutLiteral(rd, val);
Comp_AddCycles_CDI();
}
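The ROR above reproduces the ARM behaviour for misaligned 32 bit loads at compile time: the word at the aligned address is rotated right by eight bits per low address bit. A minimal sketch, with Read32 standing in for the bus access:

    u32 LoadRotated32(u32 addr)
    {
        u32 word = Read32(addr & ~0x3);  // aligned bus read
        u32 shift = (addr & 0x3) << 3;   // 0, 8, 16 or 24
        return (word >> shift) | (word << ((32 - shift) & 0x1F)); // ROR
    }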
// debug helper for the commented-out literal-address check in Comp_MemAccess below
void fault(u32 a, u32 b)
{
printf("actually not static! %x %x\n", a, b);
}
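Comp_MemAccess takes the addressing mode pre-decoded. The memop_* flags and ComplexOperand themselves aren't part of this hunk; judging purely from the usages below, they presumably look roughly like this (hypothetical reconstruction):

    enum
    {
        memop_Writeback      = 1 << 0,
        memop_Post           = 1 << 1,
        memop_SignExtend     = 1 << 2,
        memop_Store          = 1 << 3,
        memop_SubtractOffset = 1 << 4
    };

    struct ComplexOperand
    {
        ComplexOperand() {}
        ComplexOperand(u32 imm) : IsImm(true), Imm(imm) {}
        ComplexOperand(int reg, int op, int amount) : IsImm(false)
        {
            Reg.Reg = reg; Reg.Op = op; Reg.Amount = amount;
        }

        bool IsImm;
        union
        {
            u32 Imm;
            struct { int Reg, Op, Amount; } Reg;
        };
    };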
void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags)
{
if (flags & memop_Store)
{
Comp_AddCycles_CD();
}
else
{
Comp_AddCycles_CDI();
}
u32 addressMask = ~0;
if (size == 32)
addressMask = ~3;
if (size == 16)
addressMask = ~1;
if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback)))
{
Comp_MemLoadLiteral(size, rd,
R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1));
}
else
{
OpArg rdMapped = MapReg(rd);
OpArg rnMapped = MapReg(rn);
bool inlinePreparation = Num == 1; // ARM7 accesses call the raw bus functions, so the JIT masks/rotates inline
u32 constLocalROR32 = 4; // rotation for misaligned 32 bit loads; 4 = amount not known at compile time
void* memoryFunc = Num == 0
? MemoryFuncs9[size >> 4][!!(flags & memop_Store)]
: MemoryFuncs7[size >> 4][!!((flags & memop_Store))];
if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn))
{
u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
/*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr));
MOV(32, R(ABI_PARAM1), Imm32(R15));
MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)));
CMP(32, R(RSCRATCH), Imm32(addr));
FixupBranch eq = J_CC(CC_E);
CALL((void*)fault);
SetJumpTarget(eq);*/
NDS::MemRegion region;
region.Mem = NULL;
if (Num == 0)
{
ARMv5* cpu5 = (ARMv5*)CurCPU;
// stupid dtcm... (the DTCM window overrides the regular mapping, so check it first)
if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize))
{
region.Mem = cpu5->DTCM;
region.Mask = 0x3FFF;
}
else
{
NDS::ARM9GetMemRegion(addr, flags & memop_Store, &region);
}
}
else
NDS::ARM7GetMemRegion(addr, flags & memop_Store, &region);
if (region.Mem != NULL)
{
void* ptr = &region.Mem[addr & addressMask & region.Mask];
if (flags & memop_Store)
{
MOV(size, M(ptr), MapReg(rd));
}
else
{
if (flags & memop_SignExtend)
MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr));
else
MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr));
if (size == 32 && (addr & 0x3)) // rotate only if the constant address is misaligned
{
ROR_(32, rdMapped, Imm8((addr & 0x3) << 3));
}
}
return;
}
void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size);
if (specialFunc)
{
memoryFunc = specialFunc;
inlinePreparation = true;
constLocalROR32 = addr & 0x3;
}
}
X64Reg finalAddr = ABI_PARAM1;
if (flags & memop_Post)
{
MOV(32, R(ABI_PARAM1), rnMapped);
finalAddr = rnMapped.GetSimpleReg();
}
if (op2.IsImm)
{
MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)));
}
else
MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH));
{
OpArg rm = MapReg(op2.Reg.Reg);
if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg()
&& op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3)
{
LEA(32, finalAddr,
MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0));
}
else
{
bool throwAway;
OpArg offset =
Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway);
if (flags & memop_SubtractOffset)
{
MOV(32, R(finalAddr), rnMapped);
if (!offset.IsZero())
SUB(32, R(finalAddr), offset);
}
else
MOV_sum(32, finalAddr, rnMapped, offset);
}
}
if ((flags & memop_Writeback) && !(flags & memop_Post))
MOV(32, rnMapped, R(finalAddr));
if (flags & memop_Store)
MOV(32, R(ABI_PARAM2), rdMapped);
if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32)
MOV(32, rdMapped, R(ABI_PARAM1));
if (inlinePreparation && size > 8)
AND(32, R(ABI_PARAM1), Imm8(addressMask));
CALL(memoryFunc);
if (!(flags & memop_Store))
{
if (inlinePreparation && size == 32)
{
if (constLocalROR32 == 4)
{
static_assert(RSCRATCH3 == ECX, "variable ROR needs its count in ECX");
MOV(32, R(ECX), rdMapped);
AND(32, R(ECX), Imm8(3));
SHL(32, R(ECX), Imm8(3));
ROR_(32, R(RSCRATCH), R(ECX));
}
else if (constLocalROR32 != 0)
ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3));
}
if (flags & memop_SignExtend)
MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH));
else
MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH));
}
if (!(flags & memop_Store) && rd == 15)
{
if (size < 32)
printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr);
{
if (Num == 1)
AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended
Comp_JumpTo(rdMapped.GetSimpleReg());
}
}
}
}
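The payoff of the literal fast path: when the base register is a known constant and the target lies in a directly mappable region, the access collapses into a single host instruction instead of an address computation plus a call. Illustratively (not literal emitter output):

    // generic 32 bit load:               constant main-RAM address:
    //   lea  param1, [rn + imm]            movzx rd, dword [MainRAM + offset]
    //   call MemoryFunc9_Load32
    //   movzx rd, eax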
@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
s32 offset = (regsCount * 4) * (decrement ? -1 : 1);
u32 cycles = Num
? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
: (R15 & 0x2 ? 0 : CurInstr.CodeCycles);
// we need to make sure that the stack stays aligned to 16 bytes
u32 stackAlloc = ((regsCount + 1) & ~1) * 8;
MOV(32, R(ABI_PARAM4), Imm32(cycles));
if (!store)
{
Comp_AddCycles_CDI();
MOV(32, R(ABI_PARAM3), Imm32(regsCount));
SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc));
MOV(64, R(ABI_PARAM2), R(RSP));
@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
}
else
{
Comp_AddCycles_CD();
if (regsCount & 1)
PUSH(RSCRATCH);
@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
return offset;
}
OpArg Compiler::A_Comp_GetMemWBOffset()
void Compiler::A_Comp_MemWB()
{
bool load = CurInstr.Instr & (1 << 20);
bool byte = CurInstr.Instr & (1 << 22);
int size = byte ? 8 : 32;
int flags = 0;
if (!load)
flags |= memop_Store;
if (!(CurInstr.Instr & (1 << 24)))
flags |= memop_Post;
if (CurInstr.Instr & (1 << 21))
flags |= memop_Writeback;
if (!(CurInstr.Instr & (1 << 23)))
flags |= memop_SubtractOffset;
ComplexOperand offset;
if (!(CurInstr.Instr & (1 << 25)))
{
u32 imm = CurInstr.Instr & 0xFFF;
return Imm32(imm);
offset = ComplexOperand(CurInstr.Instr & 0xFFF);
}
else
{
int op = (CurInstr.Instr >> 5) & 0x3;
int amount = (CurInstr.Instr >> 7) & 0x1F;
OpArg rm = MapReg(CurInstr.A_Reg(0));
bool carryUsed;
int rm = CurInstr.A_Reg(0);
return Comp_RegShiftImm(op, amount, rm, false, carryUsed);
}
}
void Compiler::A_Comp_MemWB()
{
OpArg rn = MapReg(CurInstr.A_Reg(16));
OpArg rd = MapReg(CurInstr.A_Reg(12));
bool load = CurInstr.Instr & (1 << 20);
bool byte = CurInstr.Instr & (1 << 22);
int size = byte ? 8 : 32;
if (CurInstr.Instr & (1 << 24))
{
OpArg offset = A_Comp_GetMemWBOffset();
if (CurInstr.Instr & (1 << 23))
MOV_sum(32, ABI_PARAM1, rn, offset);
else
{
MOV(32, R(ABI_PARAM1), rn);
SUB(32, R(ABI_PARAM1), offset);
}
if (CurInstr.Instr & (1 << 21))
MOV(32, rn, R(ABI_PARAM1));
}
else
MOV(32, R(ABI_PARAM1), rn);
if (!(CurInstr.Instr & (1 << 24)))
{
OpArg offset = A_Comp_GetMemWBOffset();
if (CurInstr.Instr & (1 << 23))
ADD(32, rn, offset);
else
SUB(32, rn, offset);
offset = ComplexOperand(rm, op, amount);
}
Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
if (load && CurInstr.A_Reg(12) == 15)
{
if (byte)
printf("!!! LDRB PC %08X\n", R15);
else
{
if (Num == 1)
AND(32, rd, Imm8(0xFE)); // immediate is sign extended
Comp_JumpTo(rd.GetSimpleReg());
}
}
Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags);
}
void Compiler::A_Comp_MemHalf()
{
OpArg rn = MapReg(CurInstr.A_Reg(16));
OpArg rd = MapReg(CurInstr.A_Reg(12));
OpArg offset = CurInstr.Instr & (1 << 22)
? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0))
: MapReg(CurInstr.A_Reg(0));
ComplexOperand offset = CurInstr.Instr & (1 << 22)
? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0))
: ComplexOperand(CurInstr.A_Reg(0), 0, 0);
int op = (CurInstr.Instr >> 5) & 0x3;
bool load = CurInstr.Instr & (1 << 20);
@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf()
if (size == 32 && Num == 1)
return; // NOP
if (CurInstr.Instr & (1 << 24))
{
if (CurInstr.Instr & (1 << 23))
MOV_sum(32, ABI_PARAM1, rn, offset);
else
{
MOV(32, R(ABI_PARAM1), rn);
SUB(32, R(ABI_PARAM1), offset);
}
if (CurInstr.Instr & (1 << 21))
MOV(32, rn, R(ABI_PARAM1));
}
else
MOV(32, R(ABI_PARAM1), rn);
int flags = 0;
if (signExtend)
flags |= memop_SignExtend;
if (!load)
flags |= memop_Store;
if (!(CurInstr.Instr & (1 << 24)))
{
if (CurInstr.Instr & (1 << 23))
ADD(32, rn, offset);
else
SUB(32, rn, offset);
}
flags |= memop_Post;
if (!(CurInstr.Instr & (1 << 23)))
flags |= memop_SubtractOffset;
if (CurInstr.Instr & (1 << 21))
flags |= memop_Writeback;
Comp_MemAccess(rd, signExtend, !load, size);
if (load && CurInstr.A_Reg(12) == 15)
printf("!!! MemHalf op PC %08X\n", R15);;
Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags);
}
void Compiler::T_Comp_MemReg()
{
OpArg rd = MapReg(CurInstr.T_Reg(0));
OpArg rb = MapReg(CurInstr.T_Reg(3));
OpArg ro = MapReg(CurInstr.T_Reg(6));
int op = (CurInstr.Instr >> 10) & 0x3;
bool load = op & 0x2;
bool byte = op & 0x1;
MOV_sum(32, ABI_PARAM1, rb, ro);
Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0),
byte ? 8 : 32, load ? 0 : memop_Store);
}
void Compiler::A_Comp_LDM_STM()
@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM()
void Compiler::T_Comp_MemImm()
{
OpArg rd = MapReg(CurInstr.T_Reg(0));
OpArg rb = MapReg(CurInstr.T_Reg(3));
int op = (CurInstr.Instr >> 11) & 0x3;
bool load = op & 0x1;
bool byte = op & 0x2;
u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4);
LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset));
Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset),
byte ? 8 : 32, load ? 0 : memop_Store);
}
void Compiler::T_Comp_MemRegHalf()
{
OpArg rd = MapReg(CurInstr.T_Reg(0));
OpArg rb = MapReg(CurInstr.T_Reg(3));
OpArg ro = MapReg(CurInstr.T_Reg(6));
int op = (CurInstr.Instr >> 10) & 0x3;
bool load = op != 0;
int size = op != 1 ? 16 : 8;
bool signExtend = op & 1;
MOV_sum(32, ABI_PARAM1, rb, ro);
int flags = 0;
if (signExtend)
flags |= memop_SignExtend;
if (!load)
flags |= memop_Store;
Comp_MemAccess(rd, signExtend, !load, size);
Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0),
size, flags);
}
void Compiler::T_Comp_MemImmHalf()
{
OpArg rd = MapReg(CurInstr.T_Reg(0));
OpArg rb = MapReg(CurInstr.T_Reg(3));
u32 offset = (CurInstr.Instr >> 5) & 0x3E;
bool load = CurInstr.Instr & (1 << 11);
LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset));
Comp_MemAccess(rd, false, !load, 16);
Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16,
load ? 0 : memop_Store);
}
void Compiler::T_Comp_LoadPCRel()
{
OpArg rd = MapReg(CurInstr.T_Reg(8));
u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
// hopefully this doesn't break
u32 val; CurCPU->DataRead32(addr, &val);
MOV(32, rd, Imm32(val));
Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
}
void Compiler::T_Comp_MemSPRel()
{
u32 offset = (CurInstr.Instr & 0xFF) * 4;
OpArg rd = MapReg(CurInstr.T_Reg(8));
bool load = CurInstr.Instr & (1 << 11);
LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset));
Comp_MemAccess(rd, false, !load, 32);
Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32,
load ? 0 : memop_Store);
}
void Compiler::T_Comp_PUSH_POP()

View File

@ -36,7 +36,7 @@ enum {
A_StaticShiftSetC = 1 << 18,
A_SetC = 1 << 19,
A_WriteMemory = 1 << 20,
A_WriteMem = 1 << 20
};
#define A_BIOP A_Read16
@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(
const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL);
const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL);
const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL);
const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy);
const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy);
const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy);
const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy);
const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy);
@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD);
const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB);
#define A_LDR A_Write12
#define A_STR A_Read12 | A_WriteMemory
#define A_STR A_Read12 | A_WriteMem
#define A_IMPLEMENT_WB_LDRSTR(x,k) \
const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \
@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR)
A_IMPLEMENT_WB_LDRSTR(LDRB,LDR)
#define A_LDRD A_Write12Double
#define A_STRD A_Read12Double | A_WriteMemory
#define A_STRD A_Read12Double | A_WriteMem
#define A_IMPLEMENT_HD_LDRSTR(x,k) \
const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \
@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR)
A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR)
A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR)
const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP);
const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB);
const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP);
const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB);
const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM);
const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM);
const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM);
const u32 A_B = A_BranchAlways | ak(ak_B);
const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL);
@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC);
// THUMB
#define tk(x) ((x) << 21)
#define tk(x) ((x) << 22)
enum {
T_Read0 = 1 << 0,
@ -210,6 +210,8 @@ enum {
T_SetMaybeC = 1 << 18,
T_ReadC = 1 << 19,
T_SetC = 1 << 20,
T_WriteMem = 1 << 21,
};
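T_WriteMem claims bit 21, which the Thumb kind field previously occupied; this is why the tk() macro and the decoder below both move from a 21 to a 22 bit shift:

    // Thumb info word layout after this change:
    //   bits 0..20  - T_Read*/T_Write*/flag bits
    //   bit  21     - T_WriteMem
    //   bits 22..27 - tk_* kind, extracted below with (data >> 22) & 0x3F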
const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM);
@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP);
const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL);
const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG);
const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG);
const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG);
const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG);
const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG);
const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG);
const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG);
const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG);
const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG);
const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG);
const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG);
const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM);
const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM);
const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM);
const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM);
const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM);
const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM);
const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM);
const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM);
const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM);
const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL);
const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL);
const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL);
const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH);
const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH);
const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP);
const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA);
const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA);
const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA);
const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND);
const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX);
@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr)
if (thumb)
{
u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF];
res.Kind = (data >> 21) & 0x3F;
res.Kind = (data >> 22) & 0x3F;
if (data & T_Read0)
res.SrcRegs |= 1 << (instr & 0x7);
@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr)
if (data & T_SetC)
res.WriteFlags |= flag_C;
if (data & T_WriteMem)
res.SpecialKind = special_WriteMem;
res.EndBlock |= res.Branches();
if (res.Kind == tk_BCOND)
@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr)
u32 id = (cn<<8)|(cm<<4)|cpinfo;
if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752)
res.EndBlock |= true;
if (id == 0x704 || id == 0x782)
res.SpecialKind = special_WaitForInterrupt;
}
if (res.Kind == ak_MCR || res.Kind == ak_MRC)
{
@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr)
if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))
res.WriteFlags |= flag_C;
if (data & A_WriteMem)
res.SpecialKind = special_WriteMem;
if ((instr >> 28) < 0xE)
{
// make non conditional flag sets conditional

View File

@ -226,18 +226,27 @@ enum
flag_V = 1 << 0,
};
enum
{
special_NotSpecialAtAll = 0,
special_WriteMem,
special_WaitForInterrupt
};
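SpecialKind feeds the JIT's idle-loop recognition: a block that jumps back to its own start and performs no memory writes is a candidate idle loop (the full detection would also have to inspect register dataflow). A hypothetical sketch of the kind of check this enables; the real logic lives in the block compiler:

    bool MayBeIdleLoop(const Info* instrs, int count)
    {
        for (int i = 0; i < count; i++)
            if (instrs[i].SpecialKind == special_WriteMem)
                return false; // a store could alter the loop's exit condition
        return instrs[count - 1].Branches(); // must loop back on itself
    }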
struct Info
{
u16 DstRegs, SrcRegs;
u16 Kind;
u8 SpecialKind;
u8 ReadFlags;
// lower 4 bits - set always
// upper 4 bits - might set flag
u8 WriteFlags;
bool EndBlock;
bool Branches()
bool Branches() const
{
return DstRegs & (1 << 15);
}

View File

@ -562,9 +562,11 @@ void ARMv5::CP15Write(u32 id, u32 val)
case 0x750:
ARMJIT::InvalidateAll();
ICacheInvalidateAll();
return;
case 0x751:
ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val));
ICacheInvalidateByAddr(val);
return;
case 0x752:
@ -814,7 +816,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val)
DataCycles = 1;
*(u8*)&ITCM[addr & 0x7FFF] = val;
#ifdef JIT_ENABLED
ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
ARMJIT::InvalidateITCM(addr & 0x7FFF);
#endif
return;
}
@ -838,7 +840,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val)
DataCycles = 1;
*(u16*)&ITCM[addr & 0x7FFF] = val;
#ifdef JIT_ENABLED
ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
ARMJIT::InvalidateITCM(addr & 0x7FFF);
#endif
return;
}
@ -862,8 +864,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val)
DataCycles = 1;
*(u32*)&ITCM[addr & 0x7FFF] = val;
#ifdef JIT_ENABLED
ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL;
ARMJIT::InvalidateITCM(addr & 0x7FFF);
#endif
return;
}
@ -887,8 +888,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val)
DataCycles += 1;
*(u32*)&ITCM[addr & 0x7FFF] = val;
#ifdef JIT_ENABLED
ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL;
ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL;
ARMJIT::InvalidateITCM(addr & 0x7FFF);
#endif
return;
}
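All four ITCM write paths now funnel into ARMJIT::InvalidateITCM instead of nulling individual fast-path entries. Mirroring the invalidation code emitted for ITCM stores in Gen_MemoryRoutineSeq9 above, it presumably maps the ITCM offset into the shared code-range table and invalidates any overlapping blocks, roughly:

    // hedged sketch, following the SHR 8 / Blocks.Length check emitted above
    void InvalidateITCM(u32 localAddr)
    {
        u32 addr = ExeMemRegionOffsets[exeMem_ITCM] + (localAddr & 0x7FFF);
        if (CodeRanges[addr / 256].Blocks.Length > 0) // 256-byte ranges
            InvalidateByAddr(addr);
    }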

View File

@ -40,6 +40,7 @@ char DSiNANDPath[1024];
#ifdef JIT_ENABLED
bool JIT_Enable = false;
int JIT_MaxBlockSize = 12;
bool JIT_BrancheOptimisations = true;
#endif
ConfigEntry ConfigFile[] =
@ -56,6 +57,7 @@ ConfigEntry ConfigFile[] =
#ifdef JIT_ENABLED
{"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0},
{"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0},
{"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0},
#endif
{"", -1, NULL, 0, NULL, 0}

View File

@ -54,6 +54,7 @@ extern char DSiNANDPath[1024];
#ifdef JIT_ENABLED
extern bool JIT_Enable;
extern int JIT_MaxBlockSize;
extern bool JIT_BrancheOptimisations;
#endif
}

View File

@ -575,7 +575,7 @@ void Reset()
RCnt = 0;
#ifdef JIT_ENABLED
ARMJIT::InvalidateBlockCache();
ARMJIT::ResetBlockCache();
#endif
NDSCart::Reset();
@ -807,7 +807,7 @@ bool DoSavestate(Savestate* file)
#ifdef JIT_ENABLED
if (!file->Saving)
{
ARMJIT::InvalidateBlockCache();
ARMJIT::ResetBlockCache();
}
#endif
@ -2016,10 +2016,6 @@ u32 ARM9Read32(u32 addr)
void ARM9Write8(u32 addr, u8 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate16<0>(addr);
#endif
switch (addr & 0xFF000000)
{
case 0x02000000:
@ -2070,10 +2066,6 @@ void ARM9Write8(u32 addr, u8 val)
void ARM9Write16(u32 addr, u16 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate16<0>(addr);
#endif
switch (addr & 0xFF000000)
{
case 0x02000000:
@ -2140,10 +2132,6 @@ void ARM9Write16(u32 addr, u16 val)
void ARM9Write32(u32 addr, u32 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate32<0>(addr);
#endif
switch (addr & 0xFF000000)
{
case 0x02000000:
@ -2439,7 +2427,7 @@ u32 ARM7Read32(u32 addr)
void ARM7Write8(u32 addr, u8 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate16<1>(addr);
ARMJIT::InvalidateByAddr7(addr);
#endif
switch (addr & 0xFF800000)
@ -2502,7 +2490,7 @@ void ARM7Write8(u32 addr, u8 val)
void ARM7Write16(u32 addr, u16 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate16<1>(addr);
ARMJIT::InvalidateByAddr7(addr);
#endif
switch (addr & 0xFF800000)
@ -2575,7 +2563,7 @@ void ARM7Write16(u32 addr, u16 val)
void ARM7Write32(u32 addr, u32 val)
{
#ifdef JIT_ENABLED
ARMJIT::Invalidate32<1>(addr);
ARMJIT::InvalidateByAddr7(addr);
#endif
switch (addr & 0xFF800000)