Jit: Fix correctness issue in dcbf/dcbi/dcbst

PR #2663 added a Jit64 implementation of dcbX and a fast path to skip JIT cache invalidation. Unfortunately, there is a mismatch between address spaces in this optimization. It tests the effective address (with the top 3 bits cleared) against the valid block bitset which is intended to be indexed by physical address. While this works in the common case, it fails (for example) when the effective address is in the 7E... region (a.k.a. "fake VMEM"). This may also fall apart under more complex memory mapping scenarios requiring full MMU emulation.

The good news is that even without this fast path, the underlying call to JitInterface::InvalidateICache() still does very little work in the common case. It correctly translates the effective address to a physical address which it tests against the valid block bitset, skipping invalidation if it is not necessary. As such, the cost of removing the fast path should not be too high.

The Jit64 implementation is retained, though all it does now is emit a call. This is marginally more efficient than simple interpreter fallback, which involves an extra call. The JitArm64 implementation has also been fixed.

The game Happy Feet is fixed by this change, as it loads code in the 7E... address region and depends upon JIT cache invalidation in reponse to dcbf.

https://bugs.dolphin-emu.org/issues/12133
This commit is contained in:
smurf3tte 2021-01-23 15:17:09 -08:00
parent caff472dbf
commit 66b992cfe4
4 changed files with 19 additions and 64 deletions

View File

@ -233,37 +233,21 @@ void Jit64::dcbx(UGeckoInstruction inst)
JITDISABLE(bJITLoadStoreOff);
X64Reg addr = RSCRATCH;
X64Reg value = RSCRATCH2;
RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read);
RCX64Reg tmp = gpr.Scratch();
RegCache::Realize(Ra, Rb, tmp);
RegCache::Realize(Ra, Rb);
MOV_sum(32, addr, Ra, Rb);
AND(32, R(addr), Imm8(~31));
// Check whether a JIT cache line needs to be invalidated.
LEA(32, value, MScaled(addr, SCALE_8, 0)); // addr << 3 (masks the first 3 bits)
SHR(32, R(value), Imm8(3 + 5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset
MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
SHR(32, R(addr), Imm8(5));
BT(32, R(value), R(addr));
FixupBranch c = J_CC(CC_C, true);
SwitchToFarCode();
SetJumpTarget(c);
BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
MOV(32, R(ABI_PARAM1), R(addr));
SHL(32, R(ABI_PARAM1), Imm8(5));
MOV(32, R(ABI_PARAM2), Imm32(32));
XOR(32, R(ABI_PARAM3), R(ABI_PARAM3));
ABI_CallFunction(JitInterface::InvalidateICache);
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
asm_routines.ResetStack(*this);
c = J(true);
SwitchToNearCode();
SetJumpTarget(c);
}
void Jit64::dcbt(UGeckoInstruction inst)

View File

@ -538,11 +538,9 @@ void JitArm64::dcbx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITLoadStoreOff);
gpr.Lock(W30);
gpr.Lock(W0);
ARM64Reg addr = gpr.GetReg();
ARM64Reg value = gpr.GetReg();
ARM64Reg WA = W30;
ARM64Reg addr = W0;
u32 a = inst.RA, b = inst.RB;
@ -551,21 +549,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
else
MOV(addr, gpr.R(b));
// Check whether a JIT cache line needs to be invalidated.
AND(value, addr, 32 - 10, 28 - 10); // upper three bits and last 10 bit are masked for the bitset
// of cachelines, 0x1ffffc00
LSR(value, value, 5 + 5); // >> 5 for cache line size, >> 5 for width of bitset
MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
LSR(addr, addr, 5); // mask sizeof cacheline, & 0x1f is the position within the bitset
LSR(value, value, addr); // move current bit to bit 0
FixupBranch bit_not_set = TBZ(value, 0);
FixupBranch far_addr = B();
SwitchToFarCode();
SetJumpTarget(far_addr);
ANDI2R(addr, addr, ~31); // mask sizeof cacheline
BitSet32 gprs_to_push = gpr.GetCallerSavedUsed();
BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
@ -573,7 +557,6 @@ void JitArm64::dcbx(UGeckoInstruction inst)
ABI_PushRegisters(gprs_to_push);
m_float_emit.ABI_PushRegisters(fprs_to_push, X30);
LSL(W0, addr, 5);
MOVI2R(X1, 32);
MOVI2R(X2, 0);
MOVP2R(X3, &JitInterface::InvalidateICache);
@ -582,12 +565,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
m_float_emit.ABI_PopRegisters(fprs_to_push, X30);
ABI_PopRegisters(gprs_to_push);
FixupBranch near_addr = B();
SwitchToNearCode();
SetJumpTarget(bit_not_set);
SetJumpTarget(near_addr);
gpr.Unlock(addr, value, W30);
gpr.Unlock(W0);
}
void JitArm64::dcbt(UGeckoInstruction inst)

View File

@ -270,11 +270,6 @@ void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length)
}
}
u32* JitBaseBlockCache::GetBlockBitSet() const
{
return valid_block.m_valid_block.get();
}
void JitBaseBlockCache::WriteDestroyBlock(const JitBlock& block)
{
}

View File

@ -98,19 +98,6 @@ typedef void (*CompiledCode)();
class ValidBlockBitSet final
{
public:
enum
{
// ValidBlockBitSet covers the whole 32-bit address-space in 32-byte
// chunks.
// FIXME: Maybe we can get away with less? There isn't any actual
// RAM in most of this space.
VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32,
// The number of elements in the allocated array. Each u32 contains 32 bits.
VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32
};
// Directly accessed by Jit64.
std::unique_ptr<u32[]> m_valid_block;
ValidBlockBitSet()
{
m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]);
@ -121,6 +108,19 @@ public:
void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); }
void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); }
bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; }
private:
enum
{
// ValidBlockBitSet covers the whole 32-bit address-space in 32-byte
// chunks.
// FIXME: Maybe we can get away with less? There isn't any actual
// RAM in most of this space.
VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32,
// The number of elements in the allocated array. Each u32 contains 32 bits.
VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32
};
std::unique_ptr<u32[]> m_valid_block;
};
class JitBaseBlockCache
@ -162,8 +162,6 @@ public:
void InvalidateICache(u32 address, u32 length, bool forced);
void ErasePhysicalRange(u32 address, u32 length);
u32* GetBlockBitSet() const;
protected:
virtual void DestroyBlock(JitBlock& block);