Jit: Fix correctness issue in dcbf/dcbi/dcbst
PR #2663 added a Jit64 implementation of dcbX and a fast path to skip JIT cache invalidation. Unfortunately, that optimization mixes up address spaces: it tests the effective address (with the top three bits cleared) against the valid-block bitset, which is indexed by physical address. This works in the common case, but it fails (for example) when the effective address is in the 7E... region (a.k.a. "fake VMEM"), and it may also fall apart under more complex memory-mapping scenarios requiring full MMU emulation.

The good news is that even without this fast path, the underlying call to JitInterface::InvalidateICache() still does very little work in the common case. It correctly translates the effective address to a physical address, tests that against the valid-block bitset, and skips the invalidation when it is not necessary. As such, the cost of removing the fast path should not be too high.

The Jit64 implementation is retained, though all it does now is emit a call. This is marginally more efficient than a simple interpreter fallback, which involves an extra call. The JitArm64 implementation has also been fixed.

This change fixes Happy Feet, which loads code into the 7E... address region and depends upon JIT cache invalidation in response to dcbf. https://bugs.dolphin-emu.org/issues/12133
Parent: caff472dbf
Commit: 66b992cfe4
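To illustrate the mismatch described above, here is a minimal sketch (not Dolphin code). The physical address used for the fake-VMEM case is only a placeholder, since the real effective-to-physical translation is done by Dolphin's MMU code; the point is that the removed fast path derived its bitset index from the effective address with the top three bits cleared, while blocks are recorded in the bitset under their translated physical addresses, so the two indices only happen to agree for the ordinary cached-RAM mirrors.

#include <cstdint>
#include <cstdio>

// One bit per 32-byte cache line, as in ValidBlockBitSet further down.
constexpr uint32_t BitIndex(uint32_t address)
{
  return address >> 5;
}

int main()
{
  // EA 0x80003100 (cached MEM1 mirror) translates to PA 0x00003100, which is
  // also what clearing the top three bits produces, so the old fast path
  // happened to probe the right bit here.
  std::printf("cached RAM: fast-path index %u, correct index %u\n",
              BitIndex(0x80003100u & 0x1FFFFFFFu), BitIndex(0x00003100u));

  // EA 0x7E003100 ("fake VMEM"): clearing the top three bits yields
  // 0x1E003100, but the block was registered under its translated physical
  // address (placeholder 0x00003100 here), so the old fast path probed the
  // wrong bit and skipped a required invalidation.
  std::printf("fake VMEM:  fast-path index %u, correct index %u\n",
              BitIndex(0x7E003100u & 0x1FFFFFFFu), BitIndex(0x00003100u));
}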
Jit64 (x86-64) backend:

@@ -233,37 +233,21 @@ void Jit64::dcbx(UGeckoInstruction inst)
   JITDISABLE(bJITLoadStoreOff);
 
   X64Reg addr = RSCRATCH;
-  X64Reg value = RSCRATCH2;
   RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
   RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read);
-  RCX64Reg tmp = gpr.Scratch();
-  RegCache::Realize(Ra, Rb, tmp);
+  RegCache::Realize(Ra, Rb);
 
   MOV_sum(32, addr, Ra, Rb);
+  AND(32, R(addr), Imm8(~31));
 
-  // Check whether a JIT cache line needs to be invalidated.
-  LEA(32, value, MScaled(addr, SCALE_8, 0));  // addr << 3 (masks the first 3 bits)
-  SHR(32, R(value), Imm8(3 + 5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
-  MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
-  MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
-  SHR(32, R(addr), Imm8(5));
-  BT(32, R(value), R(addr));
-
-  FixupBranch c = J_CC(CC_C, true);
-  SwitchToFarCode();
-  SetJumpTarget(c);
   BitSet32 registersInUse = CallerSavedRegistersInUse();
   ABI_PushRegistersAndAdjustStack(registersInUse, 0);
   MOV(32, R(ABI_PARAM1), R(addr));
-  SHL(32, R(ABI_PARAM1), Imm8(5));
   MOV(32, R(ABI_PARAM2), Imm32(32));
   XOR(32, R(ABI_PARAM3), R(ABI_PARAM3));
   ABI_CallFunction(JitInterface::InvalidateICache);
   ABI_PopRegistersAndAdjustStack(registersInUse, 0);
   asm_routines.ResetStack(*this);
-  c = J(true);
-  SwitchToNearCode();
-  SetJumpTarget(c);
 }
 
 void Jit64::dcbt(UGeckoInstruction inst)
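In C++ terms, the block that the new Jit64::dcbx emits behaves roughly like the sketch below. The stub and the helper name are illustrative, not Dolphin source; it assumes JitInterface::InvalidateICache takes the same (address, length, forced) parameters as JitBaseBlockCache::InvalidateICache shown further down.

#include <cstdint>

// Stand-in for Dolphin's JitInterface::InvalidateICache, stubbed so the
// sketch compiles on its own.
namespace JitInterface
{
void InvalidateICache(uint32_t /*address*/, uint32_t /*length*/, bool /*forced*/) {}
}  // namespace JitInterface

// Rough equivalent of the emitted code: form the (rA|0) + rB effective
// address, align it to the 32-byte cache line, and let InvalidateICache()
// handle the effective-to-physical translation and the valid-block test.
void DcbxEquivalent(uint32_t ra_or_zero, uint32_t rb)
{
  const uint32_t effective_address = ra_or_zero + rb;
  JitInterface::InvalidateICache(effective_address & ~31u, 32, false);
}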
JitArm64 backend:

@@ -538,11 +538,9 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
 
-  gpr.Lock(W30);
+  gpr.Lock(W0);
 
-  ARM64Reg addr = gpr.GetReg();
-  ARM64Reg value = gpr.GetReg();
-  ARM64Reg WA = W30;
+  ARM64Reg addr = W0;
 
   u32 a = inst.RA, b = inst.RB;
 

@@ -551,21 +549,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   else
     MOV(addr, gpr.R(b));
 
-  // Check whether a JIT cache line needs to be invalidated.
-  AND(value, addr, 32 - 10, 28 - 10);  // upper three bits and last 10 bit are masked for the bitset
-                                       // of cachelines, 0x1ffffc00
-  LSR(value, value, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
-  MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
-  LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
-
-  LSR(addr, addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
-
-  LSR(value, value, addr);  // move current bit to bit 0
-
-  FixupBranch bit_not_set = TBZ(value, 0);
-  FixupBranch far_addr = B();
-  SwitchToFarCode();
-  SetJumpTarget(far_addr);
+  ANDI2R(addr, addr, ~31);  // mask sizeof cacheline
 
   BitSet32 gprs_to_push = gpr.GetCallerSavedUsed();
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();

@@ -573,7 +557,6 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   ABI_PushRegisters(gprs_to_push);
   m_float_emit.ABI_PushRegisters(fprs_to_push, X30);
 
-  LSL(W0, addr, 5);
   MOVI2R(X1, 32);
   MOVI2R(X2, 0);
   MOVP2R(X3, &JitInterface::InvalidateICache);

@@ -582,12 +565,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   m_float_emit.ABI_PopRegisters(fprs_to_push, X30);
   ABI_PopRegisters(gprs_to_push);
 
-  FixupBranch near_addr = B();
-  SwitchToNearCode();
-  SetJumpTarget(bit_not_set);
-  SetJumpTarget(near_addr);
-
-  gpr.Unlock(addr, value, W30);
+  gpr.Unlock(W0);
 }
 
 void JitArm64::dcbt(UGeckoInstruction inst)
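Both deleted inline checks (x86-64 via LEA/SHR/BT, AArch64 via AND/LSR/LDR/TBZ) evaluated the same predicate. A sketch of that removed logic follows (not Dolphin source; block_bitset stands for the array the now-removed GetBlockBitSet() returned):

#include <cstdint>

// The removed fast path: index the valid-block bitset with a value derived
// from the *effective* address, top three bits dropped. Blocks are recorded
// in that bitset by *physical* address, which is the mismatch this commit
// removes by always calling JitInterface::InvalidateICache() instead.
bool RemovedFastPathWouldInvalidate(const uint32_t* block_bitset, uint32_t effective_address)
{
  const uint32_t bit = (effective_address & 0x1FFFFFFFu) >> 5;  // one bit per 32-byte line
  return (block_bitset[bit / 32] >> (bit % 32)) & 1u;  // false -> invalidation was skipped
}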
JitBaseBlockCache implementation:

@@ -270,11 +270,6 @@ void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length)
   }
 }
 
-u32* JitBaseBlockCache::GetBlockBitSet() const
-{
-  return valid_block.m_valid_block.get();
-}
-
 void JitBaseBlockCache::WriteDestroyBlock(const JitBlock& block)
 {
 }
ValidBlockBitSet and JitBaseBlockCache declarations:

@@ -98,19 +98,6 @@ typedef void (*CompiledCode)();
 class ValidBlockBitSet final
 {
 public:
-  enum
-  {
-    // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte
-    // chunks.
-    // FIXME: Maybe we can get away with less? There isn't any actual
-    // RAM in most of this space.
-    VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32,
-    // The number of elements in the allocated array. Each u32 contains 32 bits.
-    VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32
-  };
-  // Directly accessed by Jit64.
-  std::unique_ptr<u32[]> m_valid_block;
-
   ValidBlockBitSet()
   {
     m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]);

@@ -121,6 +108,19 @@ public:
   void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); }
   void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); }
   bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; }
+
+private:
+  enum
+  {
+    // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte
+    // chunks.
+    // FIXME: Maybe we can get away with less? There isn't any actual
+    // RAM in most of this space.
+    VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32,
+    // The number of elements in the allocated array. Each u32 contains 32 bits.
+    VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32
+  };
+  std::unique_ptr<u32[]> m_valid_block;
 };
 
 class JitBaseBlockCache

@@ -162,8 +162,6 @@ public:
   void InvalidateICache(u32 address, u32 length, bool forced);
   void ErasePhysicalRange(u32 address, u32 length);
 
-  u32* GetBlockBitSet() const;
-
 protected:
   virtual void DestroyBlock(JitBlock& block);
 
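For scale: VALID_BLOCK_MASK_SIZE is 2^32 / 32 = 134,217,728 bits (one per 32-byte cache line of the 4 GiB address space), stored in 134,217,728 / 32 = 4,194,304 u32 elements, i.e. a fixed 16 MiB allocation. Below is a hedged sketch of how invalidation code consults it; the helper name and loop are illustrative (only Test() and Clear() from the class above are real), and the key point is that the bit index always comes from a physical address.

#include <cstdint>

// Illustrative only - not Dolphin's InvalidateICache implementation.
void InvalidateRange(ValidBlockBitSet& valid_block, uint32_t physical_address, uint32_t length)
{
  const uint32_t first_line = physical_address & ~31u;
  for (uint32_t line = first_line; line < physical_address + length; line += 32)
  {
    const uint32_t bit = line >> 5;  // one bit per 32-byte cache line
    if (valid_block.Test(bit))       // was a JIT block compiled from this line?
    {
      valid_block.Clear(bit);
      // The affected blocks themselves would be unlinked/destroyed here.
    }
  }
}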