Merge pull request #1559 from Sonicadvance1/armv7-minor-optimizations
ARMv7 block profiling + minor optimization
commit 2affe25191
@@ -388,6 +388,28 @@ void ARMXEmitter::YIELD()
 	Write32(condition | 0x0320F001);
 }
 
+void ARMXEmitter::MRC(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2)
+{
+	_assert_msg_(DYNA_REC, coproc <= 0xF, "%s has co-processor that is %d when it must be under 16!", __FUNCTION__, coproc);
+	_assert_msg_(DYNA_REC, opc1 <= 7, "%s has opc1 that is %d when it must be under 8!", __FUNCTION__, opc1);
+	_assert_msg_(DYNA_REC, CRn <= 0xF, "%s has CRn that is %d when it must be under 16!", __FUNCTION__, CRn);
+	_assert_msg_(DYNA_REC, opc2 <= 7, "%s has opc2 that is %d when it must be under 8!", __FUNCTION__, opc2);
+
+	Write32(condition | (0b1110 << 24) | (opc1 << 21) | (1 << 20) | (CRn << 16) \
+		| (Rt << 12) | (coproc << 8) | (opc2 << 5) | (1 << 4) | CRm);
+}
+
+void ARMXEmitter::MCR(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2)
+{
+	_assert_msg_(DYNA_REC, coproc <= 0xF, "%s has co-processor that is %d when it must be under 16!", __FUNCTION__, coproc);
+	_assert_msg_(DYNA_REC, opc1 <= 7, "%s has opc1 that is %d when it must be under 8!", __FUNCTION__, opc1);
+	_assert_msg_(DYNA_REC, CRn <= 0xF, "%s has CRn that is %d when it must be under 16!", __FUNCTION__, CRn);
+	_assert_msg_(DYNA_REC, opc2 <= 7, "%s has opc2 that is %d when it must be under 8!", __FUNCTION__, opc2);
+
+	Write32(condition | (0b1110 << 24) | (opc1 << 21) | (CRn << 16) \
+		| (Rt << 12) | (coproc << 8) | (opc2 << 5) | (1 << 4) | CRm);
+}
+
 FixupBranch ARMXEmitter::B()
 {
 	FixupBranch branch;
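For context, the word built here follows the ARMv7 MRC/MCR encoding (cond, 0b1110, opc1, L, CRn, Rt, coproc, opc2, 1, CRm). A minimal standalone sketch, not part of the patch, that packs the fields by hand for the PMCCNTR read the JIT emits later — MRC(15, 0, R1, 9, 13, 0) — and checks it against the canonical word for "mrc p15, 0, r1, c9, c13, 0":

#include <cassert>
#include <cstdint>

int main()
{
	// Fields for "mrc p15, 0, r1, c9, c13, 0" (read PMCCNTR into r1), AL condition.
	const uint32_t cond = 0xEu << 28;
	const uint32_t coproc = 15, opc1 = 0, Rt = 1, CRn = 9, CRm = 13, opc2 = 0;

	// Same packing as ARMXEmitter::MRC above; the (1 << 20) L bit marks it as a read.
	const uint32_t word = cond | (0xEu << 24) | (opc1 << 21) | (1u << 20) | (CRn << 16)
	                           | (Rt << 12) | (coproc << 8) | (opc2 << 5) | (1u << 4) | CRm;

	assert(word == 0xEE191F1D); // expected encoding of the instruction
	return 0;
}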
@@ -385,6 +385,10 @@ public:
 	// Hint instruction
 	void YIELD();
 
+	// System
+	void MRC(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2 = 0);
+	void MCR(u32 coproc, u32 opc1, ARMReg Rt, u32 CRn, u32 CRm, u32 opc2 = 0);
+
 	// Do nothing
 	void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
 
@@ -150,6 +150,10 @@ void JitArm::WriteExitDestInR(ARMReg Reg)
 	STR(Reg, R9, PPCSTATE_OFF(pc));
 	Cleanup();
 	DoDownCount();
+
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	MOVI2R(Reg, (u32)asm_routines.dispatcher);
 	B(Reg);
 	gpr.Unlock(Reg);
@@ -160,6 +164,9 @@ void JitArm::WriteRfiExitDestInR(ARMReg Reg)
 	Cleanup();
 	DoDownCount();
 
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	ARMReg A = gpr.GetReg(false);
 
 	LDR(A, R9, PPCSTATE_OFF(pc));
@@ -177,6 +184,9 @@ void JitArm::WriteExceptionExit()
 	Cleanup();
 	DoDownCount();
 
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	ARMReg A = gpr.GetReg(false);
 
 	LDR(A, R9, PPCSTATE_OFF(pc));
@@ -193,6 +203,10 @@ void JitArm::WriteExit(u32 destination)
 	Cleanup();
 
 	DoDownCount();
+
+	if (Profiler::g_ProfileBlocks)
+		EndTimeProfile(js.curBlock);
+
 	//If nobody has taken care of this yet (this can be removed when all branches are done)
 	JitBlock *b = js.curBlock;
 	JitBlock::LinkData linkData;
@@ -273,6 +287,64 @@ void JitArm::Break(UGeckoInstruction inst)
 	BKPT(0x4444);
 }
 
+void JitArm::BeginTimeProfile(JitBlock* b)
+{
+	b->ticCounter = 0;
+	b->ticStart = 0;
+	b->ticStop = 0;
+
+	// Performance counters are a bit finicky on ARM.
+	// We must first enable and program the PMU before using it.
+	// This is a per-core operation, so with thread scheduling we may jump to a core where the PMU hasn't been enabled yet.
+	// Work around this by enabling the PMU each time at the start of a block.
+	// Some ARM CPUs are getting absurd core counts (48+!).
+	// We have to reset counters at the start of every block anyway, so may as well.
+	// One thing to note about performance counters on ARM:
+	// the kernel can block access to these co-processor registers.
+	// In the case that this happens, these will generate a SIGILL.
+
+	// Refer to the ARM ARM about PMCR for what these do exactly.
+	enum
+	{
+		PERF_OPTION_ENABLE = (1 << 0),
+		PERF_OPTION_RESET_CR = (1 << 1),
+		PERF_OPTION_RESET_CCR = (1 << 2),
+		PERF_OPTION_DIVIDER_MODE = (1 << 3),
+		PERF_OPTION_EXPORT_ENABLE = (1 << 4),
+	};
+	const u32 perf_options =
+		PERF_OPTION_ENABLE |
+		PERF_OPTION_RESET_CR |
+		PERF_OPTION_RESET_CCR |
+		PERF_OPTION_EXPORT_ENABLE;
+	MOVI2R(R0, perf_options);
+	// Programs the PMCR
+	MCR(15, 0, R0, 9, 12, 0);
+
+	MOVI2R(R0, 0x8000000F);
+	// Enables all counters
+	MCR(15, 0, R0, 9, 12, 1);
+	// Clears all counter overflows
+	MCR(15, 0, R0, 9, 12, 3);
+
+	// Gets the cycle counter
+	MRC(15, 0, R1, 9, 13, 0);
+	MOVI2R(R0, (u32)&b->ticStart);
+	STR(R1, R0, 0);
+}
+
+void JitArm::EndTimeProfile(JitBlock* b)
+{
+	// Gets the cycle counter
+	MRC(15, 0, R1, 9, 13, 0);
+	MOVI2R(R0, (u32)&b->ticStop);
+	STR(R1, R0, 0);
+
+	MOVI2R(R0, (u32)&b->ticStart);
+	MOVI2R(R14, (u32)asm_routines.m_increment_profile_counter);
+	BL(R14);
+}
+
 const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 {
 	int blockSize = code_buf->GetSize();
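For readers less familiar with the co-processor numbers, what the emitted sequence does at runtime is roughly the following host-side sketch (illustrative, not part of the patch; written as GCC inline assembly against the ARMv7 PMU registers PMCR, PMCNTENSET, PMOVSR and PMCCNTR, and assuming the kernel allows user-space access — otherwise the accesses fault with SIGILL, exactly as the comment warns):

#include <cstdint>

static inline void pmu_program_and_reset()
{
	// enable, reset event counters, reset cycle counter, export enable (PMCR bits 0, 1, 2, 4)
	const uint32_t pmcr = (1u << 0) | (1u << 1) | (1u << 2) | (1u << 4);
	asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(pmcr));        // PMCR       <- MCR(15, 0, R0, 9, 12, 0)
	asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x8000000Fu)); // PMCNTENSET <- MCR(15, 0, R0, 9, 12, 1)
	asm volatile("mcr p15, 0, %0, c9, c12, 3" :: "r"(0x8000000Fu)); // PMOVSR     <- MCR(15, 0, R0, 9, 12, 3)
}

static inline uint32_t pmu_read_cycle_counter()
{
	uint32_t cycles;
	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cycles));      // PMCCNTR    <- MRC(15, 0, R1, 9, 13, 0)
	return cycles;
}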
@@ -362,8 +434,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 		LDR(rB, rA); // Load the actual value in to R11.
 		ADD(rB, rB, 1); // Add one to the value
 		STR(rB, rA); // Now store it back in the memory location
-		// get start tic
-		PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStart);
+		BeginTimeProfile(b);
 		gpr.Unlock(rA, rB);
 	}
 	gpr.Start(js.gpa);
@@ -390,16 +461,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 			// WARNING - cmp->branch merging will screw this up.
 			js.isLastInstruction = true;
 			js.next_inst = 0;
-			if (Profiler::g_ProfileBlocks)
-			{
-				// CAUTION!!! push on stack regs you use, do your stuff, then pop
-				PROFILER_VPUSH;
-				// get end tic
-				PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop);
-				// tic counter += (end tic - start tic)
-				PROFILER_UPDATE_TIME(&b);
-				PROFILER_VPOP;
-			}
 		}
 		else
 		{
@@ -416,26 +477,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 			POP(4, R0, R1, R2, R3);
 		}
 
-		if (Profiler::g_ProfileBlocks)
-		{
-			// Add run count
-			static const u64 One = 1;
-			ARMReg RA = gpr.GetReg();
-			ARMReg RB = gpr.GetReg();
-			ARMReg VA = fpr.GetReg();
-			ARMReg VB = fpr.GetReg();
-			MOVI2R(RA, (u32)&opinfo->runCount);
-			MOVI2R(RB, (u32)&One);
-			VLDR(VA, RA, 0);
-			VLDR(VB, RB, 0);
-			NEONXEmitter nemit(this);
-			nemit.VADD(I_64, VA, VA, VB);
-			VSTR(VA, RA, 0);
-			gpr.Unlock(RA, RB);
-			fpr.Unlock(VA);
-			fpr.Unlock(VB);
-		}
-
 		if (!ops[i].skip)
 		{
 			if (js.memcheck && (opinfo->flags & FL_USE_FPU))
@@ -444,6 +485,13 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlock *b)
 				BKPT(0x7777);
 			}
 			JitArmTables::CompileInstruction(ops[i]);
+
+			// If we have a register that will never be used again, flush it.
+			for (int j : ~ops[i].gprInUse)
+				gpr.StoreFromRegister(j);
+			for (int j : ~ops[i].fprInUse)
+				fpr.StoreFromRegister(j);
+
 			if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
 			{
 				// Don't do this yet
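The new loop flushes every guest register that has no later use in the block by iterating the complement of the analyst's "in use" bitset. A self-contained sketch of the idiom (the helper below is illustrative, not Dolphin's BitSet API):

#include <cstdint>

// Illustrative only: visit each cleared bit of a 32-bit usage mask and flush that register.
void FlushDeadRegisters(uint32_t in_use, void (*store_from_register)(int))
{
	uint32_t dead = ~in_use;          // registers with no remaining use in this block
	while (dead)
	{
		int j = __builtin_ctz(dead);  // index of the lowest set bit
		store_from_register(j);       // write the cached value back to the guest state and free the host register
		dead &= dead - 1;             // clear that bit and continue
	}
}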
@@ -58,6 +58,10 @@ private:
 	ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
 
 	bool BackPatch(SContext* ctx);
+
+	void BeginTimeProfile(JitBlock* b);
+	void EndTimeProfile(JitBlock* b);
+
 public:
 	JitArm() : code_buffer(32000) {}
 	~JitArm() {}
@@ -609,4 +609,15 @@ void JitArmAsmRoutineManager::GenerateCommon()
 	pairedStoreQuantized[14] = storeSingleS8;
 	pairedStoreQuantized[15] = storeSingleS16;
 
+	m_increment_profile_counter = AlignCode16();
+
+	nemit.VLD1(I_64, D0, R0); // Start
+	ADD(R0, R0, 8);
+	nemit.VLD1(I_64, D1, R0); // End
+	ADD(R0, R0, 8);
+	nemit.VLD1(I_64, D2, R0); // Counter
+	nemit.VSUB(I_64, D1, D1, D0);
+	nemit.VADD(I_64, D2, D2, D1);
+	nemit.VST1(I_64, D2, R0);
+	MOV(_PC, _LR);
 }
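In C terms the routine computes counter += stop - start over three consecutive u64s, with R0 pointing at the first of them (which is how EndTimeProfile calls it, passing &b->ticStart). A sketch of the assumed layout — the struct and function names here are illustrative:

#include <cstdint>

struct BlockTics          // assumed layout: three consecutive 64-bit fields
{
	uint64_t ticStart;    // loaded into D0 at R0 + 0
	uint64_t ticStop;     // loaded into D1 at R0 + 8
	uint64_t ticCounter;  // loaded into D2 at R0 + 16, stored back after the add
};

void IncrementProfileCounter(BlockTics* t)  // t plays the role of R0
{
	t->ticCounter += t->ticStop - t->ticStart;
}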
@@ -14,6 +14,8 @@ private:
 	void GenerateCommon();
 
 public:
+	const u8* m_increment_profile_counter;
+
 	void Init()
 	{
 		AllocCodeSpace(8192);
@@ -161,6 +161,7 @@ ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad)
 		ArmCRegs[regindex].PS1 = PS1;
 
 		_regs[preg][PS1].LoadToReg(regindex);
+		if (preLoad)
 		emit->VLDR(ArmCRegs[regindex].Reg, R9, offset);
 		return ArmCRegs[regindex].Reg;
 	}
@@ -178,6 +179,7 @@ ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad)
 	ArmCRegs[lastRegIndex].PS1 = PS1;
 
 	_regs[preg][PS1].LoadToReg(lastRegIndex);
+	if (preLoad)
 	emit->VLDR(ArmCRegs[lastRegIndex].Reg, R9, offsetNew);
 	return ArmCRegs[lastRegIndex].Reg;
 }
@@ -225,3 +227,26 @@ void ArmFPRCache::Flush(FlushMode mode)
 	}
 }
 
+void ArmFPRCache::StoreFromRegister(u32 preg)
+{
+	if (_regs[preg][0].GetType() != REG_NOTLOADED)
+	{
+		s16 offset = PPCSTATE_OFF(ps) + (preg * 16);
+		u32 regindex = _regs[preg][0].GetRegIndex();
+		emit->VSTR(ArmCRegs[regindex].Reg, R9, offset);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		_regs[preg][0].Flush();
+	}
+	if (_regs[preg][1].GetType() != REG_NOTLOADED)
+	{
+		s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + 8;
+		u32 regindex = _regs[preg][1].GetRegIndex();
+		emit->VSTR(ArmCRegs[regindex].Reg, R9, offset);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		_regs[preg][1].Flush();
+	}
+}
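The store offsets assume each guest paired-single register occupies 16 bytes relative to the ps array base (PPCSTATE_OFF(ps)), PS0 at +0 and PS1 at +8. A small sketch of that arithmetic (illustrative, mirrors the offsets used above):

#include <cstdint>

constexpr uint32_t PS0Offset(uint32_t preg) { return preg * 16; }     // first double of the pair
constexpr uint32_t PS1Offset(uint32_t preg) { return preg * 16 + 8; } // second double of the pair

static_assert(PS0Offset(3) == 48 && PS1Offset(3) == 56,
              "16 bytes per guest paired-single register");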
@@ -45,4 +45,6 @@ public:
 	void Flush(FlushMode mode = FLUSH_ALL);
 	ArmGen::ARMReg R0(u32 preg, bool preLoad = true); // Returns a cached register
 	ArmGen::ARMReg R1(u32 preg, bool preLoad = true);
+
+	void StoreFromRegister(u32 preg);
 };
@@ -300,3 +300,20 @@ void ArmRegCache::Flush(FlushMode mode)
 	}
 }
 
+void ArmRegCache::StoreFromRegister(u32 preg)
+{
+	if (regs[preg].GetType() == REG_IMM)
+	{
+		// This changes the type over to a REG_REG and gets caught below.
+		BindToRegister(preg, true, true);
+	}
+	if (regs[preg].GetType() == REG_REG)
+	{
+		u32 regindex = regs[preg].GetRegIndex();
+		emit->STR(ArmCRegs[regindex].Reg, R9, PPCSTATE_OFF(gpr) + preg * 4);
+
+		ArmCRegs[regindex].PPCReg = 33;
+		ArmCRegs[regindex].LastLoad = 0;
+		regs[preg].Flush();
+	}
+}
@@ -135,4 +135,6 @@ public:
 	// Public function doesn't kill immediates
 	// In reality when you call R(u32) it'll bind an immediate there
 	void BindToRegister(u32 preg, bool doLoad = true);
+
+	void StoreFromRegister(u32 preg);
 };