Merge pull request #10007 from AdmiralCurtiss/x64-dcbx-in-loop

Jit64: dcbx loop detection for improved performance when invalidating large memory regions.
This commit is contained in:
JMC47 2021-08-16 21:27:16 -04:00 committed by GitHub
commit d162015112
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 167 additions and 19 deletions

View File

@ -234,20 +234,94 @@ void Jit64::dcbx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
X64Reg addr = RSCRATCH;
X64Reg value = RSCRATCH2;
// Check if the next instructions match a known looping pattern:
// - dcbx rX
// - addi rX,rX,32
// - bdnz+ -8
const bool make_loop = inst.RA == 0 && inst.RB != 0 && CanMergeNextInstructions(2) &&
(js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
js.op[1].inst.RA_6 == inst.RB && js.op[1].inst.RD_2 == inst.RB &&
js.op[2].inst.hex == 0x4200fff8;
RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read);
RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
RCX64Reg tmp = gpr.Scratch();
RCX64Reg effective_address = gpr.Scratch();
RegCache::Realize(Ra, Rb, tmp, effective_address);
// Translate effective address to physical address.
RCX64Reg loop_counter;
if (make_loop)
{
// We'll execute somewhere between one single cacheline invalidation and however many are needed
// to reduce the downcount to zero, never exceeding the amount requested by the game.
// To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
// by the amount of cache lines we invalidate minus one -- since we'll run the regular addi and
// bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
// all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
RCX64Reg& reg_cycle_count = tmp;
RCX64Reg& reg_downcount = effective_address;
loop_counter = gpr.Scratch();
RegCache::Realize(loop_counter);
// This must be true in order for us to pick up the DIV results and not trash any data.
static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
// Alright, now figure out how many loops we want to do.
const u8 cycle_count_per_loop =
js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
// This is both setting the adjusted loop count to 0 for the downcount <= 0 case and clearing
// the upper bits for the DIV instruction in the downcount > 0 case.
XOR(32, R(RSCRATCH2), R(RSCRATCH2));
MOV(32, R(reg_downcount), PPCSTATE(downcount));
TEST(32, R(reg_downcount), R(reg_downcount)); // if (downcount <= 0)
FixupBranch downcount_is_zero_or_negative = J_CC(CC_LE); // only do 1 invalidation; else:
MOV(32, R(loop_counter), PPCSTATE_CTR);
MOV(32, R(RSCRATCH), R(reg_downcount));
MOV(32, R(reg_cycle_count), Imm32(cycle_count_per_loop));
DIV(32, R(reg_cycle_count)); // RSCRATCH = downcount / cycle_count
LEA(32, RSCRATCH2, MDisp(loop_counter, -1)); // RSCRATCH2 = CTR - 1
// ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
CMP(32, R(RSCRATCH), R(RSCRATCH2));
CMOVcc(32, RSCRATCH2, R(RSCRATCH), CC_B); // RSCRATCH2 = min(RSCRATCH, RSCRATCH2)
// RSCRATCH2 now holds the amount of loops to execute minus 1, which is the amount we need to
// adjust downcount, CTR, and Rb by to exit the loop construct with the right values in those
// registers.
SUB(32, R(loop_counter), R(RSCRATCH2));
MOV(32, PPCSTATE_CTR, R(loop_counter)); // CTR -= RSCRATCH2
MOV(32, R(RSCRATCH), R(RSCRATCH2));
IMUL(32, RSCRATCH, R(reg_cycle_count));
// ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
SUB(32, R(reg_downcount), R(RSCRATCH));
MOV(32, PPCSTATE(downcount), R(reg_downcount)); // downcount -= (RSCRATCH2 * reg_cycle_count)
SetJumpTarget(downcount_is_zero_or_negative);
// Load the loop_counter register with the amount of invalidations to execute.
LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
}
X64Reg value = RSCRATCH;
MOV_sum(32, value, Ra, Rb);
if (make_loop)
{
// This is the best place to adjust Rb to what it should be since RSCRATCH2 still has the
// adjusted loop count and we're done reading from Rb.
SHL(32, R(RSCRATCH2), Imm8(5));
ADD(32, R(Rb), R(RSCRATCH2)); // Rb += (RSCRATCH2 * 32)
}
X64Reg addr = RSCRATCH2;
FixupBranch bat_lookup_failed;
MOV(32, R(effective_address), R(value));
const u8* loop_start = GetCodePtr();
if (MSR.IR)
{
// Translate effective address to physical address.
bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
MOV(32, R(addr), R(effective_address));
AND(32, R(addr), Imm32(0x0001ffff));
@ -264,6 +338,14 @@ void Jit64::dcbx(UGeckoInstruction inst)
BT(32, R(value), R(addr));
FixupBranch invalidate_needed = J_CC(CC_C, true);
if (make_loop)
{
ADD(32, R(effective_address), Imm8(32));
MOV(32, R(value), R(effective_address));
SUB(32, R(loop_counter), Imm8(1));
J_CC(CC_NZ, loop_start);
}
SwitchToFarCode();
SetJumpTarget(invalidate_needed);
if (MSR.IR)
@ -272,9 +354,20 @@ void Jit64::dcbx(UGeckoInstruction inst)
BitSet32 registersInUse = CallerSavedRegistersInUse();
registersInUse[X64Reg(tmp)] = false;
registersInUse[X64Reg(effective_address)] = false;
if (make_loop)
registersInUse[X64Reg(loop_counter)] = false;
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
MOV(32, R(ABI_PARAM1), R(effective_address));
ABI_CallFunction(JitInterface::InvalidateICacheLine);
if (make_loop)
{
MOV(32, R(ABI_PARAM1), R(effective_address));
MOV(32, R(ABI_PARAM2), R(loop_counter));
ABI_CallFunction(JitInterface::InvalidateICacheLines);
}
else
{
MOV(32, R(ABI_PARAM1), R(effective_address));
ABI_CallFunction(JitInterface::InvalidateICacheLine);
}
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
asm_routines.ResetStack(*this);

View File

@ -183,16 +183,51 @@ const u8* JitBaseBlockCache::Dispatch()
return block->normalEntry;
}
void JitBaseBlockCache::InvalidateICache(u32 address, u32 length, bool forced)
// Forwards the single 32-byte cache line that contains `address` to InvalidateICacheInternal.
// The address is rounded down to a 32-byte boundary before it is translated, and nothing is
// invalidated when the translation is not valid.
// NOTE(review): this text appears to come from a diff rendering without +/- markers. The first
// `translated` / `physical_address` group below looks like removed pre-change lines (it makes
// `translated` declared twice in one scope) -- confirm against the real file.
void JitBaseBlockCache::InvalidateICacheLine(u32 address)
{
const auto translated = PowerPC::JitCache_TranslateAddress(address);
if (!translated.valid)
return;
const u32 physical_address = translated.address;
// Clear the low five bits to get the start of the 32-byte line.
const u32 cache_line_address = address & ~0x1f;
const auto translated = PowerPC::JitCache_TranslateAddress(cache_line_address);
// Always exactly one line (32 bytes) and never a forced invalidation.
if (translated.valid)
InvalidateICacheInternal(translated.address, cache_line_address, 32, false);
}
// Optimize the common case of length == 32 which is used by Interpreter::dcb*
// Invalidates `initial_length` bytes starting at `initial_address`.
//
// The range is processed in chunks that never cross a translation boundary: the boundary size is
// 1 << BAT_INDEX_SHIFT when the chunk's start address was translated through a BAT, and
// 1 << HW_PAGE_INDEX_SHIFT otherwise. Each chunk is translated on its own and handed to
// InvalidateICacheInternal; chunks whose translation is not valid are skipped.
// `forced` is passed through unchanged.
void JitBaseBlockCache::InvalidateICache(u32 initial_address, u32 initial_length, bool forced)
{
  u32 cursor = initial_address;
  u32 remaining = initial_length;

  while (remaining != 0)
  {
    const auto lookup = PowerPC::JitCache_TranslateAddress(cursor);

    // Pick the boundary granularity that applies to this chunk's start address.
    const bool via_bat = lookup.valid && lookup.translated && lookup.from_bat;
    const int unit_shift = via_bat ? PowerPC::BAT_INDEX_SHIFT : PowerPC::HW_PAGE_INDEX_SHIFT;
    const u32 unit_size = 1u << unit_shift;
    const u32 unit_mask = ~(unit_size - 1u);

    // If the first and last byte share a unit, the rest of the range is a single chunk.
    const u32 final_byte = cursor + (remaining - 1u);
    const bool fits_in_unit = (cursor & unit_mask) == (final_byte & unit_mask);

    // Otherwise only go up to the start of the next unit.
    const u32 chunk = fits_in_unit ? remaining : ((cursor + unit_size) & unit_mask) - cursor;

    if (lookup.valid)
      InvalidateICacheInternal(lookup.address, cursor, chunk, forced);

    if (fits_in_unit)
      break;

    cursor += chunk;
    remaining -= chunk;
  }
}
void JitBaseBlockCache::InvalidateICacheInternal(u32 physical_address, u32 address, u32 length,
bool forced)
{
// Optimization for the case of invalidating a single cache line, which is used by the dcb*
// instructions. If the valid_block bit for that cacheline is not set, we can safely skip
// the remaining invalidation logic.
bool destroy_block = true;
if (length == 32)
if (length == 32 && (physical_address & 0x1fu) == 0)
{
if (!valid_block.Test(physical_address / 32))
destroy_block = false;

View File

@ -161,6 +161,7 @@ public:
const u8* Dispatch();
void InvalidateICache(u32 address, u32 length, bool forced);
void InvalidateICacheLine(u32 address);
void ErasePhysicalRange(u32 address, u32 length);
u32* GetBlockBitSet() const;
@ -177,6 +178,7 @@ private:
void LinkBlockExits(JitBlock& block);
void LinkBlock(JitBlock& block);
void UnlinkBlock(const JitBlock& block);
void InvalidateICacheInternal(u32 physical_address, u32 address, u32 length, bool forced);
JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr);

View File

@ -226,7 +226,24 @@ void InvalidateICache(u32 address, u32 size, bool forced)
// Invalidates the 32-byte cache line that contains `address`.
// NOTE(review): this text appears to come from a diff rendering without +/- markers. The
// InvalidateICache(...) call below is probably the removed pre-change line and the g_jit
// dispatch its replacement -- confirm against the real file before relying on both running.
void InvalidateICacheLine(u32 address)
{
// Generic range path: address aligned down to 32 bytes, length of one line, not forced.
InvalidateICache(address & ~0x1f, 32, false);
// Only dispatch to the block cache when a JIT instance exists.
if (g_jit)
g_jit->GetBlockCache()->InvalidateICacheLine(address);
}
// Invalidates `count` consecutive 32-byte cache lines, starting with the line containing
// `address`.
//
// This corresponds to a PPC code loop that:
// - calls some form of dcb* instruction on 'address'
// - increments 'address' by the size of a cache line (0x20 bytes)
// - decrements 'count' by 1
// - jumps back to the dcb* instruction if 'count' != 0
// with an extra optimization for the case of a single cache line invalidation.
//
// A `count` of 0, or one whose byte length (32 * count) would not fit in 32 bits, invalidates
// 0xffffffff bytes starting at the aligned address.
void InvalidateICacheLines(u32 address, u32 count)
{
  // The three cases are mutually exclusive. In particular, a single line handled by the fast
  // path must not also be pushed through the generic range invalidation.
  if (count == 1)
  {
    InvalidateICacheLine(address);
  }
  else if (count == 0 || count >= static_cast<u32>(0x1'0000'0000 / 32))
  {
    // 32 * count would overflow u32 here, so clamp to the largest representable length.
    InvalidateICache(address & ~0x1f, 0xffffffff, false);
  }
  else
  {
    InvalidateICache(address & ~0x1f, 32 * count, false);
  }
}
void CompileExceptionCheck(ExceptionType type)

View File

@ -63,6 +63,7 @@ void ClearSafe();
// If "forced" is true, a recompile is being requested on code that hasn't been modified.
void InvalidateICache(u32 address, u32 size, bool forced);
void InvalidateICacheLine(u32 address);
void InvalidateICacheLines(u32 address, u32 count);
void CompileExceptionCheck(ExceptionType type);

View File

@ -30,10 +30,6 @@
namespace PowerPC
{
constexpr size_t HW_PAGE_SIZE = 4096;
constexpr u32 HW_PAGE_INDEX_SHIFT = 12;
constexpr u32 HW_PAGE_INDEX_MASK = 0x3f;
// EFB RE
/*
GXPeekZ

View File

@ -222,5 +222,9 @@ inline bool TranslateBatAddess(const BatTable& bat_table, u32* address, bool* wi
return true;
}
constexpr size_t HW_PAGE_SIZE = 4096;
constexpr u32 HW_PAGE_INDEX_SHIFT = 12;
constexpr u32 HW_PAGE_INDEX_MASK = 0x3f;
std::optional<u32> GetTranslatedAddress(u32 address);
} // namespace PowerPC

View File

@ -128,7 +128,7 @@ void InstructionCache::Invalidate(u32 addr)
}
}
valid[set] = 0;
JitInterface::InvalidateICache(addr & ~0x1f, 32, false);
JitInterface::InvalidateICacheLine(addr);
}
u32 InstructionCache::ReadInstruction(u32 addr)