Merge pull request #10007 from AdmiralCurtiss/x64-dcbx-in-loop
Jit64: dcbx loop detection for improved performance when invalidating large memory regions.
This commit is contained in:
commit
d162015112
|
@ -234,20 +234,94 @@ void Jit64::dcbx(UGeckoInstruction inst)
|
|||
INSTRUCTION_START
|
||||
JITDISABLE(bJITLoadStoreOff);
|
||||
|
||||
X64Reg addr = RSCRATCH;
|
||||
X64Reg value = RSCRATCH2;
|
||||
// Check if the next instructions match a known looping pattern:
|
||||
// - dcbx rX
|
||||
// - addi rX,rX,32
|
||||
// - bdnz+ -8
|
||||
const bool make_loop = inst.RA == 0 && inst.RB != 0 && CanMergeNextInstructions(2) &&
|
||||
(js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
|
||||
js.op[1].inst.RA_6 == inst.RB && js.op[1].inst.RD_2 == inst.RB &&
|
||||
js.op[2].inst.hex == 0x4200fff8;
|
||||
|
||||
RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
|
||||
RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read);
|
||||
RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
|
||||
RCX64Reg tmp = gpr.Scratch();
|
||||
RCX64Reg effective_address = gpr.Scratch();
|
||||
RegCache::Realize(Ra, Rb, tmp, effective_address);
|
||||
|
||||
// Translate effective address to physical address.
|
||||
RCX64Reg loop_counter;
|
||||
if (make_loop)
|
||||
{
|
||||
// We'll execute somewhere between one single cacheline invalidation and however many are needed
|
||||
// to reduce the downcount to zero, never exceeding the amount requested by the game.
|
||||
// To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
|
||||
// by the amount of cache lines we invalidate minus one -- since we'll run the regular addi and
|
||||
// bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
|
||||
// all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
|
||||
|
||||
RCX64Reg& reg_cycle_count = tmp;
|
||||
RCX64Reg& reg_downcount = effective_address;
|
||||
loop_counter = gpr.Scratch();
|
||||
RegCache::Realize(loop_counter);
|
||||
|
||||
// This must be true in order for us to pick up the DIV results and not trash any data.
|
||||
static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
|
||||
|
||||
// Alright, now figure out how many loops we want to do.
|
||||
const u8 cycle_count_per_loop =
|
||||
js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
|
||||
|
||||
// This is both setting the adjusted loop count to 0 for the downcount <= 0 case and clearing
|
||||
// the upper bits for the DIV instruction in the downcount > 0 case.
|
||||
XOR(32, R(RSCRATCH2), R(RSCRATCH2));
|
||||
|
||||
MOV(32, R(reg_downcount), PPCSTATE(downcount));
|
||||
TEST(32, R(reg_downcount), R(reg_downcount)); // if (downcount <= 0)
|
||||
FixupBranch downcount_is_zero_or_negative = J_CC(CC_LE); // only do 1 invalidation; else:
|
||||
MOV(32, R(loop_counter), PPCSTATE_CTR);
|
||||
MOV(32, R(RSCRATCH), R(reg_downcount));
|
||||
MOV(32, R(reg_cycle_count), Imm32(cycle_count_per_loop));
|
||||
DIV(32, R(reg_cycle_count)); // RSCRATCH = downcount / cycle_count
|
||||
LEA(32, RSCRATCH2, MDisp(loop_counter, -1)); // RSCRATCH2 = CTR - 1
|
||||
// ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
|
||||
CMP(32, R(RSCRATCH), R(RSCRATCH2));
|
||||
CMOVcc(32, RSCRATCH2, R(RSCRATCH), CC_B); // RSCRATCH2 = min(RSCRATCH, RSCRATCH2)
|
||||
|
||||
// RSCRATCH2 now holds the amount of loops to execute minus 1, which is the amount we need to
|
||||
// adjust downcount, CTR, and Rb by to exit the loop construct with the right values in those
|
||||
// registers.
|
||||
SUB(32, R(loop_counter), R(RSCRATCH2));
|
||||
MOV(32, PPCSTATE_CTR, R(loop_counter)); // CTR -= RSCRATCH2
|
||||
MOV(32, R(RSCRATCH), R(RSCRATCH2));
|
||||
IMUL(32, RSCRATCH, R(reg_cycle_count));
|
||||
// ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
|
||||
SUB(32, R(reg_downcount), R(RSCRATCH));
|
||||
MOV(32, PPCSTATE(downcount), R(reg_downcount)); // downcount -= (RSCRATCH2 * reg_cycle_count)
|
||||
|
||||
SetJumpTarget(downcount_is_zero_or_negative);
|
||||
|
||||
// Load the loop_counter register with the amount of invalidations to execute.
|
||||
LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
|
||||
}
|
||||
|
||||
X64Reg value = RSCRATCH;
|
||||
MOV_sum(32, value, Ra, Rb);
|
||||
|
||||
if (make_loop)
|
||||
{
|
||||
// This is the best place to adjust Rb to what it should be since RSCRATCH2 still has the
|
||||
// adjusted loop count and we're done reading from Rb.
|
||||
SHL(32, R(RSCRATCH2), Imm8(5));
|
||||
ADD(32, R(Rb), R(RSCRATCH2)); // Rb += (RSCRATCH2 * 32)
|
||||
}
|
||||
|
||||
X64Reg addr = RSCRATCH2;
|
||||
FixupBranch bat_lookup_failed;
|
||||
MOV(32, R(effective_address), R(value));
|
||||
const u8* loop_start = GetCodePtr();
|
||||
if (MSR.IR)
|
||||
{
|
||||
// Translate effective address to physical address.
|
||||
bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
|
||||
MOV(32, R(addr), R(effective_address));
|
||||
AND(32, R(addr), Imm32(0x0001ffff));
|
||||
|
@ -264,6 +338,14 @@ void Jit64::dcbx(UGeckoInstruction inst)
|
|||
BT(32, R(value), R(addr));
|
||||
FixupBranch invalidate_needed = J_CC(CC_C, true);
|
||||
|
||||
if (make_loop)
|
||||
{
|
||||
ADD(32, R(effective_address), Imm8(32));
|
||||
MOV(32, R(value), R(effective_address));
|
||||
SUB(32, R(loop_counter), Imm8(1));
|
||||
J_CC(CC_NZ, loop_start);
|
||||
}
|
||||
|
||||
SwitchToFarCode();
|
||||
SetJumpTarget(invalidate_needed);
|
||||
if (MSR.IR)
|
||||
|
@ -272,9 +354,20 @@ void Jit64::dcbx(UGeckoInstruction inst)
|
|||
BitSet32 registersInUse = CallerSavedRegistersInUse();
|
||||
registersInUse[X64Reg(tmp)] = false;
|
||||
registersInUse[X64Reg(effective_address)] = false;
|
||||
if (make_loop)
|
||||
registersInUse[X64Reg(loop_counter)] = false;
|
||||
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
|
||||
MOV(32, R(ABI_PARAM1), R(effective_address));
|
||||
ABI_CallFunction(JitInterface::InvalidateICacheLine);
|
||||
if (make_loop)
|
||||
{
|
||||
MOV(32, R(ABI_PARAM1), R(effective_address));
|
||||
MOV(32, R(ABI_PARAM2), R(loop_counter));
|
||||
ABI_CallFunction(JitInterface::InvalidateICacheLines);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOV(32, R(ABI_PARAM1), R(effective_address));
|
||||
ABI_CallFunction(JitInterface::InvalidateICacheLine);
|
||||
}
|
||||
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
||||
asm_routines.ResetStack(*this);
|
||||
|
||||
|
|
|
@ -183,16 +183,51 @@ const u8* JitBaseBlockCache::Dispatch()
|
|||
return block->normalEntry;
|
||||
}
|
||||
|
||||
void JitBaseBlockCache::InvalidateICache(u32 address, u32 length, bool forced)
|
||||
void JitBaseBlockCache::InvalidateICacheLine(u32 address)
|
||||
{
|
||||
const auto translated = PowerPC::JitCache_TranslateAddress(address);
|
||||
if (!translated.valid)
|
||||
return;
|
||||
const u32 physical_address = translated.address;
|
||||
const u32 cache_line_address = address & ~0x1f;
|
||||
const auto translated = PowerPC::JitCache_TranslateAddress(cache_line_address);
|
||||
if (translated.valid)
|
||||
InvalidateICacheInternal(translated.address, cache_line_address, 32, false);
|
||||
}
|
||||
|
||||
// Optimize the common case of length == 32 which is used by Interpreter::dcb*
|
||||
// Invalidates JIT blocks for the effective address range
// [initial_address, initial_address + initial_length).
//
// The range is processed piecewise: each iteration translates the current address and handles
// at most the bytes up to the next translation boundary (BAT_INDEX_SHIFT granularity when the
// translation came from a BAT, HW_PAGE_INDEX_SHIFT granularity otherwise). Pieces whose
// translation is not valid are skipped. 'forced' is passed through unchanged to
// InvalidateICacheInternal.
void JitBaseBlockCache::InvalidateICache(u32 initial_address, u32 initial_length, bool forced)
{
  u32 cursor = initial_address;
  u32 remaining = initial_length;
  while (remaining != 0)
  {
    const auto lookup = PowerPC::JitCache_TranslateAddress(cursor);

    // Pick the granularity at which the translation of 'cursor' stays applicable.
    const bool via_bat = lookup.valid && lookup.translated && lookup.from_bat;
    const int granule_shift = via_bat ? PowerPC::BAT_INDEX_SHIFT : PowerPC::HW_PAGE_INDEX_SHIFT;
    const u32 granule_mask = ~((1u << granule_shift) - 1u);

    // Does the rest of the range end inside the same granule that 'cursor' is in?
    const u32 final_byte = cursor + (remaining - 1u);
    const bool fits_in_granule = (cursor & granule_mask) == (final_byte & granule_mask);

    // Either take everything that is left, or only the bytes up to the next boundary.
    u32 chunk = remaining;
    if (!fits_in_granule)
    {
      const u32 next_boundary = (cursor + (1u << granule_shift)) & granule_mask;
      chunk = next_boundary - cursor;
    }

    if (lookup.valid)
      InvalidateICacheInternal(lookup.address, cursor, chunk, forced);

    if (fits_in_granule)
      return;

    cursor += chunk;
    remaining -= chunk;
  }
}
|
||||
|
||||
void JitBaseBlockCache::InvalidateICacheInternal(u32 physical_address, u32 address, u32 length,
|
||||
bool forced)
|
||||
{
|
||||
// Optimization for the case of invalidating a single cache line, which is used by the dcb*
|
||||
// instructions. If the valid_block bit for that cacheline is not set, we can safely skip
|
||||
// the remaining invalidation logic.
|
||||
bool destroy_block = true;
|
||||
if (length == 32)
|
||||
if (length == 32 && (physical_address & 0x1fu) == 0)
|
||||
{
|
||||
if (!valid_block.Test(physical_address / 32))
|
||||
destroy_block = false;
|
||||
|
|
|
@ -161,6 +161,7 @@ public:
|
|||
const u8* Dispatch();
|
||||
|
||||
void InvalidateICache(u32 address, u32 length, bool forced);
|
||||
void InvalidateICacheLine(u32 address);
|
||||
void ErasePhysicalRange(u32 address, u32 length);
|
||||
|
||||
u32* GetBlockBitSet() const;
|
||||
|
@ -177,6 +178,7 @@ private:
|
|||
void LinkBlockExits(JitBlock& block);
|
||||
void LinkBlock(JitBlock& block);
|
||||
void UnlinkBlock(const JitBlock& block);
|
||||
void InvalidateICacheInternal(u32 physical_address, u32 address, u32 length, bool forced);
|
||||
|
||||
JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr);
|
||||
|
||||
|
|
|
@ -226,7 +226,24 @@ void InvalidateICache(u32 address, u32 size, bool forced)
|
|||
|
||||
void InvalidateICacheLine(u32 address)
|
||||
{
|
||||
InvalidateICache(address & ~0x1f, 32, false);
|
||||
if (g_jit)
|
||||
g_jit->GetBlockCache()->InvalidateICacheLine(address);
|
||||
}
|
||||
|
||||
// Invalidates 'count' consecutive cache lines (32 bytes each), starting with the cache line
// that contains 'address'.
//
// This corresponds to a PPC code loop that:
// - calls some form of dcb* instruction on 'address'
// - increments 'address' by the size of a cache line (0x20 bytes)
// - decrements 'count' by 1
// - jumps back to the dcb* instruction if 'count' != 0
// with an extra optimization for the case of a single cache line invalidation.
void InvalidateICacheLines(u32 address, u32 count)
{
  if (count == 1)
  {
    // Single cache line: take the dedicated fast path and nothing else. The branches below must
    // be chained with 'else', otherwise this line would be invalidated a second time through
    // the general range path.
    InvalidateICacheLine(address);
  }
  else if (count == 0 || count >= static_cast<u32>(0x1'0000'0000 / 32))
  {
    // In the loop described above, a starting count of 0 wraps around on the first decrement,
    // and for counts this large '32 * count' no longer fits in a u32. In both cases use the
    // largest length a u32 can express.
    InvalidateICache(address & ~0x1f, 0xffffffff, false);
  }
  else
  {
    InvalidateICache(address & ~0x1f, 32 * count, false);
  }
}
|
||||
|
||||
void CompileExceptionCheck(ExceptionType type)
|
||||
|
|
|
@ -63,6 +63,7 @@ void ClearSafe();
|
|||
// If "forced" is true, a recompile is being requested on code that hasn't been modified.
|
||||
void InvalidateICache(u32 address, u32 size, bool forced);
|
||||
void InvalidateICacheLine(u32 address);
|
||||
void InvalidateICacheLines(u32 address, u32 count);
|
||||
|
||||
void CompileExceptionCheck(ExceptionType type);
|
||||
|
||||
|
|
|
@ -30,10 +30,6 @@
|
|||
|
||||
namespace PowerPC
|
||||
{
|
||||
constexpr size_t HW_PAGE_SIZE = 4096;
|
||||
constexpr u32 HW_PAGE_INDEX_SHIFT = 12;
|
||||
constexpr u32 HW_PAGE_INDEX_MASK = 0x3f;
|
||||
|
||||
// EFB RE
|
||||
/*
|
||||
GXPeekZ
|
||||
|
|
|
@ -222,5 +222,9 @@ inline bool TranslateBatAddess(const BatTable& bat_table, u32* address, bool* wi
|
|||
return true;
|
||||
}
|
||||
|
||||
constexpr size_t HW_PAGE_SIZE = 4096;
|
||||
constexpr u32 HW_PAGE_INDEX_SHIFT = 12;
|
||||
constexpr u32 HW_PAGE_INDEX_MASK = 0x3f;
|
||||
|
||||
std::optional<u32> GetTranslatedAddress(u32 address);
|
||||
} // namespace PowerPC
|
||||
|
|
|
@ -128,7 +128,7 @@ void InstructionCache::Invalidate(u32 addr)
|
|||
}
|
||||
}
|
||||
valid[set] = 0;
|
||||
JitInterface::InvalidateICache(addr & ~0x1f, 32, false);
|
||||
JitInterface::InvalidateICacheLine(addr);
|
||||
}
|
||||
|
||||
u32 InstructionCache::ReadInstruction(u32 addr)
|
||||
|
|
Loading…
Reference in New Issue