Merge pull request #9957 from JosJuice/dcbx-faster

Jit: Perform BAT lookup in dcbf/dcbi/dcbst
Léo Lam, 2021-07-31 03:27:24 +02:00 (committed by GitHub)
commit a208ff5aab
7 changed files with 131 additions and 25 deletions
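
For context: previously, dcbf/dcbi/dcbst unconditionally masked the effective address and called JitInterface::InvalidateICache. With this change, the JIT first translates the effective address to a physical address via the BAT table and then tests the block cache's valid-block bitset, so the expensive invalidation call only happens when a compiled block may actually cover the target cache line (or when the BAT lookup fails and the slow path has to sort it out). The translation the emitted code mirrors can be sketched in plain C++; this is an illustrative sketch only, reusing names from the diff (dbat_table, BAT_INDEX_SHIFT, BAT_MAPPED_BIT) and assuming BAT_INDEX_SHIFT is 17, i.e. one table entry per 128 KiB region holding the upper 15 bits of the physical address plus flag bits.

#include <array>
#include <cstdint>
#include <optional>

// Illustrative stand-ins for PowerPC::dbat_table and its constants (assumed values).
constexpr uint32_t BAT_INDEX_SHIFT = 17;  // assumed: one entry per 128 KiB region
constexpr uint32_t BAT_MAPPED_BIT = 1;    // assumed flag bit position
std::array<uint32_t, 1u << (32 - BAT_INDEX_SHIFT)> dbat_table{};

// Effective -> physical translation as mirrored by the emitted code: index the table
// with the upper 15 bits of the effective address, fail if the region is not BAT-mapped,
// otherwise combine the entry's upper 15 bits with the effective address's lower 17 bits
// (the AND 0x0001ffff / AND 0xfffe0000 / OR sequence in the x64 diff, the BFI in the
// ARM64 diff).
std::optional<uint32_t> TranslateDataAddress(uint32_t effective_addr)
{
  const uint32_t entry = dbat_table[effective_addr >> BAT_INDEX_SHIFT];
  if ((entry & BAT_MAPPED_BIT) == 0)
    return std::nullopt;  // lookup failed; the generated code jumps to the slow path
  return (entry & 0xfffe0000) | (effective_addr & 0x0001ffff);
}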


@@ -234,14 +234,41 @@ void Jit64::dcbx(UGeckoInstruction inst)
   JITDISABLE(bJITLoadStoreOff);
 
   X64Reg addr = RSCRATCH;
+  X64Reg value = RSCRATCH2;
   RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
   RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read);
-  RegCache::Realize(Ra, Rb);
+  RCX64Reg tmp = gpr.Scratch();
+  RegCache::Realize(Ra, Rb, tmp);
 
-  MOV_sum(32, addr, Ra, Rb);
-  AND(32, R(addr), Imm8(~31));
+  // Translate effective address to physical address.
+  MOV_sum(32, value, Ra, Rb);
+  FixupBranch bat_lookup_failed;
+  if (MSR.IR)
+  {
+    MOV(32, R(addr), R(value));
+    bat_lookup_failed = BATAddressLookup(value, tmp);
+    AND(32, R(addr), Imm32(0x0001ffff));
+    AND(32, R(value), Imm32(0xfffe0000));
+    OR(32, R(value), R(addr));
+  }
+  MOV(32, R(addr), R(value));
+
+  // Check whether a JIT cache line needs to be invalidated.
+  SHR(32, R(value), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
+  MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
+  MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
+  SHR(32, R(addr), Imm8(5));
+  BT(32, R(value), R(addr));
+
+  FixupBranch invalidate_needed = J_CC(CC_C, true);
+  SwitchToFarCode();
+  SetJumpTarget(invalidate_needed);
+  SHL(32, R(addr), Imm8(5));
+  if (MSR.IR)
+    SetJumpTarget(bat_lookup_failed);
 
   BitSet32 registersInUse = CallerSavedRegistersInUse();
+  registersInUse[X64Reg(tmp)] = false;
   ABI_PushRegistersAndAdjustStack(registersInUse, 0);
   MOV(32, R(ABI_PARAM1), R(addr));
   MOV(32, R(ABI_PARAM2), Imm32(32));
@@ -249,6 +276,10 @@ void Jit64::dcbx(UGeckoInstruction inst)
   ABI_CallFunction(JitInterface::InvalidateICache);
   ABI_PopRegistersAndAdjustStack(registersInUse, 0);
   asm_routines.ResetStack(*this);
+
+  FixupBranch done = J(true);
+  SwitchToNearCode();
+  SetJumpTarget(done);
 }
 
 void Jit64::dcbt(UGeckoInstruction inst)
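
In scalar terms, the SHR/BT sequence in the near code above tests one bit per 32-byte cache line in the array returned by GetBlockCache()->GetBlockBitSet(); only when that bit is set (or when the BAT lookup failed) does execution fall into the far code that calls JitInterface::InvalidateICache. A minimal sketch of the check, with a made-up helper name:

#include <cstdint>

// Hypothetical scalar equivalent of the emitted check: bit (physical_addr >> 5) of the
// bitset, stored as 32-bit words, says whether any JIT block covers this cache line.
bool CacheLineHasJitBlocks(const uint32_t* block_bitset, uint32_t physical_addr)
{
  const uint32_t bit = physical_addr >> 5;       // one bit per 32-byte cache line
  const uint32_t word = block_bitset[bit >> 5];  // >> 5 again: 32 bits per word
  return ((word >> (bit & 31)) & 1) != 0;
}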


@@ -91,6 +91,16 @@ void EmuCodeBlock::SwitchToNearCode()
   SetCodePtr(m_near_code, m_near_code_end, m_near_code_write_failed);
 }
 
+FixupBranch EmuCodeBlock::BATAddressLookup(X64Reg addr, X64Reg tmp)
+{
+  MOV(64, R(tmp), ImmPtr(&PowerPC::dbat_table[0]));
+  SHR(32, R(addr), Imm8(PowerPC::BAT_INDEX_SHIFT));
+  MOV(32, R(addr), MComplex(tmp, addr, SCALE_4, 0));
+  BT(32, R(addr), Imm8(IntLog2(PowerPC::BAT_MAPPED_BIT)));
+
+  return J_CC(CC_Z, m_far_code.Enabled());
+}
+
 FixupBranch EmuCodeBlock::CheckIfSafeAddress(const OpArg& reg_value, X64Reg reg_addr,
                                              BitSet32 registers_in_use)
 {


@@ -49,6 +49,10 @@ public:
     return Gen::M(m_const_pool.GetConstant(&value, sizeof(T), N, index));
   }
 
+  // Writes upper 15 bits of physical address to addr and clobbers the lower 17 bits of addr.
+  // Jumps to the returned FixupBranch if lookup fails.
+  Gen::FixupBranch BATAddressLookup(Gen::X64Reg addr, Gen::X64Reg tmp);
+
   Gen::FixupBranch CheckIfSafeAddress(const Gen::OpArg& reg_value, Gen::X64Reg reg_addr,
                                       BitSet32 registers_in_use);
   void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize,


@@ -230,6 +230,10 @@ protected:
   // Loadstore routines
   void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
   void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
+  // If lookup succeeds, writes upper 15 bits of physical address to addr_out. If not,
+  // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out.
+  Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in,
+                                         Arm64Gen::ARM64Reg tmp);
 
   void DoJit(u32 em_address, JitBlock* b, u32 nextPC);


@@ -274,6 +274,19 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset)
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
 }
 
+FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp)
+{
+  tmp = EncodeRegTo64(tmp);
+
+  MOVP2R(tmp, PowerPC::dbat_table.data());
+  LSR(addr_out, addr_in, PowerPC::BAT_INDEX_SHIFT);
+  LDR(addr_out, tmp, ArithOption(addr_out, true));
+  FixupBranch pass = TBNZ(addr_out, IntLog2(PowerPC::BAT_MAPPED_BIT));
+  FixupBranch fail = B();
+  SetJumpTarget(pass);
+  return fail;
+}
+
 void JitArm64::lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
@@ -539,34 +552,71 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
 
-  gpr.Lock(ARM64Reg::W0);
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
 
-  ARM64Reg addr = ARM64Reg::W0;
+  ARM64Reg effective_addr = ARM64Reg::W0;
+  ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr;
+  ARM64Reg value = gpr.GetReg();
+  ARM64Reg WA = ARM64Reg::W30;
 
   u32 a = inst.RA, b = inst.RB;
 
   if (a)
-    ADD(addr, gpr.R(a), gpr.R(b));
+    ADD(effective_addr, gpr.R(a), gpr.R(b));
   else
-    MOV(addr, gpr.R(b));
+    MOV(effective_addr, gpr.R(b));
 
-  AND(addr, addr, LogicalImm(~31, 32));  // mask sizeof cacheline
+  // Translate effective address to physical address.
+  FixupBranch bat_lookup_failed;
+  if (MSR.IR)
+  {
+    bat_lookup_failed = BATAddressLookup(physical_addr, effective_addr, WA);
+    BFI(physical_addr, effective_addr, 0, PowerPC::BAT_INDEX_SHIFT);
+  }
+
+  // Check whether a JIT cache line needs to be invalidated.
+  LSR(value, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
+  MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
+  LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
+  LSR(WA, physical_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
+  LSRV(value, value, WA);  // move current bit to bit 0
+
+  FixupBranch bit_not_set = TBZ(value, 0);
+  FixupBranch far_addr = B();
+  SwitchToFarCode();
+  SetJumpTarget(far_addr);
+
+  if (MSR.IR)
+    SetJumpTarget(bat_lookup_failed);
 
   BitSet32 gprs_to_push = gpr.GetCallerSavedUsed();
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
+  gprs_to_push[DecodeReg(effective_addr)] = false;
+  gprs_to_push[DecodeReg(physical_addr)] = false;
+  gprs_to_push[DecodeReg(value)] = false;
+  gprs_to_push[DecodeReg(WA)] = false;
 
   ABI_PushRegisters(gprs_to_push);
   m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
 
-  MOVI2R(ARM64Reg::X1, 32);
-  MOVI2R(ARM64Reg::X2, 0);
-  MOVP2R(ARM64Reg::X3, &JitInterface::InvalidateICache);
-  BLR(ARM64Reg::X3);
+  MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICache);
+  // W0 was already set earlier
+  MOVI2R(ARM64Reg::W1, 32);
+  MOVI2R(ARM64Reg::W2, 0);
+  BLR(ARM64Reg::X8);
 
   m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
   ABI_PopRegisters(gprs_to_push);
 
-  gpr.Unlock(ARM64Reg::W0);
+  FixupBranch near_addr = B();
+  SwitchToNearCode();
+  SetJumpTarget(bit_not_set);
+  SetJumpTarget(near_addr);
+
+  gpr.Unlock(effective_addr, value, WA);
+  if (MSR.IR)
+    gpr.Unlock(physical_addr);
 }
 
 void JitArm64::dcbt(UGeckoInstruction inst)


@@ -269,6 +269,11 @@ void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length)
   }
 }
 
+u32* JitBaseBlockCache::GetBlockBitSet() const
+{
+  return valid_block.m_valid_block.get();
+}
+
 void JitBaseBlockCache::WriteDestroyBlock(const JitBlock& block)
 {
 }


@@ -99,18 +99,6 @@ typedef void (*CompiledCode)();
 class ValidBlockBitSet final
 {
 public:
-  ValidBlockBitSet()
-  {
-    m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]);
-    ClearAll();
-  }
-
-  void Set(u32 bit) { m_valid_block[bit / 32] |= 1u << (bit % 32); }
-  void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); }
-  void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); }
-  bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; }
-
-private:
   enum
   {
     // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte
@@ -121,7 +109,19 @@ private:
     // The number of elements in the allocated array. Each u32 contains 32 bits.
     VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32
   };
 
+  // Directly accessed by Jit64.
   std::unique_ptr<u32[]> m_valid_block;
+
+  ValidBlockBitSet()
+  {
+    m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]);
+    ClearAll();
+  }
+
+  void Set(u32 bit) { m_valid_block[bit / 32] |= 1u << (bit % 32); }
+  void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); }
+  void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); }
+  bool Test(u32 bit) const { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; }
 };
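
With this reordering, m_valid_block is deliberately left public ("Directly accessed by Jit64"): GetBlockBitSet() hands the raw u32 array to the emitters, which bake its address straight into generated code (the ImmPtr and MOVP2R uses in the diffs above). Reading a bit from that raw array is equivalent to calling Test(); a small sketch of the correspondence, with a hypothetical helper name:

#include <cstdint>

// Hypothetical illustration: the word/bit arithmetic the generated code performs on the
// raw array returned by GetBlockBitSet() matches ValidBlockBitSet::Test(valid_bit),
// where valid_bit is physical_address >> 5.
bool RawBitSetTest(const uint32_t* bits, uint32_t valid_bit)
{
  return (bits[valid_bit / 32] & (1u << (valid_bit % 32))) != 0;
}
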
class JitBaseBlockCache

@@ -163,6 +163,8 @@ public:
   void InvalidateICache(u32 address, u32 length, bool forced);
   void ErasePhysicalRange(u32 address, u32 length);
 
+  u32* GetBlockBitSet() const;
+
 protected:
   virtual void DestroyBlock(JitBlock& block);