Jit64: Optimize dcbx being called in a loop over a large memory region.

This commit is contained in:
Admiral H. Curtiss 2021-08-10 20:40:01 +02:00
parent df1e59409b
commit 8b2f5d5006
3 changed files with 116 additions and 6 deletions

View File

@ -234,20 +234,94 @@ void Jit64::dcbx(UGeckoInstruction inst)
INSTRUCTION_START INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff); JITDISABLE(bJITLoadStoreOff);
X64Reg addr = RSCRATCH; // Check if the next instructions match a known looping pattern:
X64Reg value = RSCRATCH2; // - dcbx rX
// - addi rX,rX,32
// - bdnz+ -8
const bool make_loop = inst.RA == 0 && inst.RB != 0 && CanMergeNextInstructions(2) &&
(js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
js.op[1].inst.RA_6 == inst.RB && js.op[1].inst.RD_2 == inst.RB &&
js.op[2].inst.hex == 0x4200fff8;
RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0); RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read); RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
RCX64Reg tmp = gpr.Scratch(); RCX64Reg tmp = gpr.Scratch();
RCX64Reg effective_address = gpr.Scratch(); RCX64Reg effective_address = gpr.Scratch();
RegCache::Realize(Ra, Rb, tmp, effective_address); RegCache::Realize(Ra, Rb, tmp, effective_address);
// Translate effective address to physical address. RCX64Reg loop_counter;
if (make_loop)
{
// We'll execute somewhere between one single cacheline invalidation and however many are needed
// to reduce the downcount to zero, never exceeding the amount requested by the game.
// To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
// by the amount of cache lines we invalidate minus one -- since we'll run the regular addi and
// bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
// all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
RCX64Reg& reg_cycle_count = tmp;
RCX64Reg& reg_downcount = effective_address;
loop_counter = gpr.Scratch();
RegCache::Realize(loop_counter);
// This must be true in order for us to pick up the DIV results and not trash any data.
static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
// Alright, now figure out how many loops we want to do.
const u8 cycle_count_per_loop =
js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
// This is both setting the adjusted loop count to 0 for the downcount <= 0 case and clearing
// the upper bits for the DIV instruction in the downcount > 0 case.
XOR(32, R(RSCRATCH2), R(RSCRATCH2));
MOV(32, R(reg_downcount), PPCSTATE(downcount));
TEST(32, R(reg_downcount), R(reg_downcount)); // if (downcount <= 0)
FixupBranch downcount_is_zero_or_negative = J_CC(CC_LE); // only do 1 invalidation; else:
MOV(32, R(loop_counter), PPCSTATE_CTR);
MOV(32, R(RSCRATCH), R(reg_downcount));
MOV(32, R(reg_cycle_count), Imm32(cycle_count_per_loop));
DIV(32, R(reg_cycle_count)); // RSCRATCH = downcount / cycle_count
LEA(32, RSCRATCH2, MDisp(loop_counter, -1)); // RSCRATCH2 = CTR - 1
// ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
CMP(32, R(RSCRATCH), R(RSCRATCH2));
CMOVcc(32, RSCRATCH2, R(RSCRATCH), CC_B); // RSCRATCH2 = min(RSCRATCH, RSCRATCH2)
// RSCRATCH2 now holds the amount of loops to execute minus 1, which is the amount we need to
// adjust downcount, CTR, and Rb by to exit the loop construct with the right values in those
// registers.
SUB(32, R(loop_counter), R(RSCRATCH2));
MOV(32, PPCSTATE_CTR, R(loop_counter)); // CTR -= RSCRATCH2
MOV(32, R(RSCRATCH), R(RSCRATCH2));
IMUL(32, RSCRATCH, R(reg_cycle_count));
// ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
SUB(32, R(reg_downcount), R(RSCRATCH));
MOV(32, PPCSTATE(downcount), R(reg_downcount)); // downcount -= (RSCRATCH2 * reg_cycle_count)
SetJumpTarget(downcount_is_zero_or_negative);
// Load the loop_counter register with the amount of invalidations to execute.
LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
}
X64Reg value = RSCRATCH;
MOV_sum(32, value, Ra, Rb); MOV_sum(32, value, Ra, Rb);
if (make_loop)
{
// This is the best place to adjust Rb to what it should be since RSCRATCH2 still has the
// adjusted loop count and we're done reading from Rb.
SHL(32, R(RSCRATCH2), Imm8(5));
ADD(32, R(Rb), R(RSCRATCH2)); // Rb += (RSCRATCH2 * 32)
}
X64Reg addr = RSCRATCH2;
FixupBranch bat_lookup_failed; FixupBranch bat_lookup_failed;
MOV(32, R(effective_address), R(value)); MOV(32, R(effective_address), R(value));
const u8* loop_start = GetCodePtr();
if (MSR.IR) if (MSR.IR)
{ {
// Translate effective address to physical address.
bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data()); bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
MOV(32, R(addr), R(effective_address)); MOV(32, R(addr), R(effective_address));
AND(32, R(addr), Imm32(0x0001ffff)); AND(32, R(addr), Imm32(0x0001ffff));
@ -264,6 +338,14 @@ void Jit64::dcbx(UGeckoInstruction inst)
BT(32, R(value), R(addr)); BT(32, R(value), R(addr));
FixupBranch invalidate_needed = J_CC(CC_C, true); FixupBranch invalidate_needed = J_CC(CC_C, true);
if (make_loop)
{
ADD(32, R(effective_address), Imm8(32));
MOV(32, R(value), R(effective_address));
SUB(32, R(loop_counter), Imm8(1));
J_CC(CC_NZ, loop_start);
}
SwitchToFarCode(); SwitchToFarCode();
SetJumpTarget(invalidate_needed); SetJumpTarget(invalidate_needed);
if (MSR.IR) if (MSR.IR)
@ -272,9 +354,20 @@ void Jit64::dcbx(UGeckoInstruction inst)
BitSet32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
registersInUse[X64Reg(tmp)] = false; registersInUse[X64Reg(tmp)] = false;
registersInUse[X64Reg(effective_address)] = false; registersInUse[X64Reg(effective_address)] = false;
if (make_loop)
registersInUse[X64Reg(loop_counter)] = false;
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
MOV(32, R(ABI_PARAM1), R(effective_address)); if (make_loop)
ABI_CallFunction(JitInterface::InvalidateICacheLine); {
MOV(32, R(ABI_PARAM1), R(effective_address));
MOV(32, R(ABI_PARAM2), R(loop_counter));
ABI_CallFunction(JitInterface::InvalidateICacheLines);
}
else
{
MOV(32, R(ABI_PARAM1), R(effective_address));
ABI_CallFunction(JitInterface::InvalidateICacheLine);
}
ABI_PopRegistersAndAdjustStack(registersInUse, 0); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
asm_routines.ResetStack(*this); asm_routines.ResetStack(*this);

View File

@ -230,6 +230,22 @@ void InvalidateICacheLine(u32 address)
g_jit->GetBlockCache()->InvalidateICacheLine(address); g_jit->GetBlockCache()->InvalidateICacheLine(address);
} }
// Invalidates 'count' consecutive instruction cache lines starting at the line containing
// 'address'.
//
// This corresponds to a PPC code loop that:
// - calls some form of dcb* instruction on 'address'
// - increments 'address' by the size of a cache line (0x20 bytes)
// - decrements 'count' by 1
// - jumps back to the dcb* instruction if 'count' != 0
// with an extra optimization for the case of a single cache line invalidation.
void InvalidateICacheLines(u32 address, u32 count)
{
  // The three cases are mutually exclusive and must stay chained with 'else': otherwise the
  // single-line fast path would fall through and invalidate the same line a second time via
  // the generic range invalidation below.
  if (count == 1)
  {
    InvalidateICacheLine(address);
  }
  else if (count == 0 || count >= static_cast<u32>(0x1'0000'0000 / 32))
  {
    // In the loop described above a count of 0 only reaches zero again after wrapping around,
    // and any count of at least 2^32 / 32 lines would make the byte size (32 * count) overflow
    // 32 bits. Both cases are handled by requesting the largest representable size.
    InvalidateICache(address & ~0x1f, 0xffffffff, false);
  }
  else
  {
    // Align the start down to a cache line boundary; 32 * count cannot overflow here because
    // count is below 2^32 / 32.
    InvalidateICache(address & ~0x1f, 32 * count, false);
  }
}
void CompileExceptionCheck(ExceptionType type) void CompileExceptionCheck(ExceptionType type)
{ {
if (!g_jit) if (!g_jit)

View File

@ -63,6 +63,7 @@ void ClearSafe();
// If "forced" is true, a recompile is being requested on code that hasn't been modified. // If "forced" is true, a recompile is being requested on code that hasn't been modified.
void InvalidateICache(u32 address, u32 size, bool forced); void InvalidateICache(u32 address, u32 size, bool forced);
void InvalidateICacheLine(u32 address); void InvalidateICacheLine(u32 address);
void InvalidateICacheLines(u32 address, u32 count);
void CompileExceptionCheck(ExceptionType type); void CompileExceptionCheck(ExceptionType type);