From 66b992cfe441fedd7f21827946873bf4c0238b6f Mon Sep 17 00:00:00 2001 From: smurf3tte <75271109+smurf3tte@users.noreply.github.com> Date: Sat, 23 Jan 2021 15:17:09 -0800 Subject: [PATCH] Jit: Fix correctness issue in dcbf/dcbi/dcbst PR #2663 added a Jit64 implementation of dcbX and a fast path to skip JIT cache invalidation. Unfortunately, there is a mismatch between address spaces in this optimization. It tests the effective address (with the top 3 bits cleared) against the valid block bitset which is intended to be indexed by physical address. While this works in the common case, it fails (for example) when the effective address is in the 7E... region (a.k.a. "fake VMEM"). This may also fall apart under more complex memory mapping scenarios requiring full MMU emulation. The good news is that even without this fast path, the underlying call to JitInterface::InvalidateICache() still does very little work in the common case. It correctly translates the effective address to a physical address which it tests against the valid block bitset, skipping invalidation if it is not necessary. As such, the cost of removing the fast path should not be too high. The Jit64 implementation is retained, though all it does now is emit a call. This is marginally more efficient than simple interpreter fallback, which involves an extra call. The JitArm64 implementation has also been fixed. The game Happy Feet is fixed by this change, as it loads code in the 7E... address region and depends upon JIT cache invalidation in reponse to dcbf. https://bugs.dolphin-emu.org/issues/12133 --- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 20 ++----------- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 30 +++---------------- .../Core/Core/PowerPC/JitCommon/JitCache.cpp | 5 ---- Source/Core/Core/PowerPC/JitCommon/JitCache.h | 28 ++++++++--------- 4 files changed, 19 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index acd9dbe70b..71770840a3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -233,37 +233,21 @@ void Jit64::dcbx(UGeckoInstruction inst) JITDISABLE(bJITLoadStoreOff); X64Reg addr = RSCRATCH; - X64Reg value = RSCRATCH2; RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0); RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read); - RCX64Reg tmp = gpr.Scratch(); - RegCache::Realize(Ra, Rb, tmp); + RegCache::Realize(Ra, Rb); MOV_sum(32, addr, Ra, Rb); + AND(32, R(addr), Imm8(~31)); - // Check whether a JIT cache line needs to be invalidated. - LEA(32, value, MScaled(addr, SCALE_8, 0)); // addr << 3 (masks the first 3 bits) - SHR(32, R(value), Imm8(3 + 5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset - MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet())); - MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0)); - SHR(32, R(addr), Imm8(5)); - BT(32, R(value), R(addr)); - - FixupBranch c = J_CC(CC_C, true); - SwitchToFarCode(); - SetJumpTarget(c); BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); MOV(32, R(ABI_PARAM1), R(addr)); - SHL(32, R(ABI_PARAM1), Imm8(5)); MOV(32, R(ABI_PARAM2), Imm32(32)); XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); ABI_CallFunction(JitInterface::InvalidateICache); ABI_PopRegistersAndAdjustStack(registersInUse, 0); asm_routines.ResetStack(*this); - c = J(true); - SwitchToNearCode(); - SetJumpTarget(c); } void Jit64::dcbt(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 215db3bf72..2974a0c065 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -538,11 +538,9 @@ void JitArm64::dcbx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); - gpr.Lock(W30); + gpr.Lock(W0); - ARM64Reg addr = gpr.GetReg(); - ARM64Reg value = gpr.GetReg(); - ARM64Reg WA = W30; + ARM64Reg addr = W0; u32 a = inst.RA, b = inst.RB; @@ -551,21 +549,7 @@ void JitArm64::dcbx(UGeckoInstruction inst) else MOV(addr, gpr.R(b)); - // Check whether a JIT cache line needs to be invalidated. - AND(value, addr, 32 - 10, 28 - 10); // upper three bits and last 10 bit are masked for the bitset - // of cachelines, 0x1ffffc00 - LSR(value, value, 5 + 5); // >> 5 for cache line size, >> 5 for width of bitset - MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet()); - LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true)); - - LSR(addr, addr, 5); // mask sizeof cacheline, & 0x1f is the position within the bitset - - LSR(value, value, addr); // move current bit to bit 0 - - FixupBranch bit_not_set = TBZ(value, 0); - FixupBranch far_addr = B(); - SwitchToFarCode(); - SetJumpTarget(far_addr); + ANDI2R(addr, addr, ~31); // mask sizeof cacheline BitSet32 gprs_to_push = gpr.GetCallerSavedUsed(); BitSet32 fprs_to_push = fpr.GetCallerSavedUsed(); @@ -573,7 +557,6 @@ void JitArm64::dcbx(UGeckoInstruction inst) ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, X30); - LSL(W0, addr, 5); MOVI2R(X1, 32); MOVI2R(X2, 0); MOVP2R(X3, &JitInterface::InvalidateICache); @@ -582,12 +565,7 @@ void JitArm64::dcbx(UGeckoInstruction inst) m_float_emit.ABI_PopRegisters(fprs_to_push, X30); ABI_PopRegisters(gprs_to_push); - FixupBranch near_addr = B(); - SwitchToNearCode(); - SetJumpTarget(bit_not_set); - SetJumpTarget(near_addr); - - gpr.Unlock(addr, value, W30); + gpr.Unlock(W0); } void JitArm64::dcbt(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp index 8bd0b4c725..ed29151e6c 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp @@ -270,11 +270,6 @@ void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length) } } -u32* JitBaseBlockCache::GetBlockBitSet() const -{ - return valid_block.m_valid_block.get(); -} - void JitBaseBlockCache::WriteDestroyBlock(const JitBlock& block) { } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index d1020a06b6..678e808f2a 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -98,19 +98,6 @@ typedef void (*CompiledCode)(); class ValidBlockBitSet final { public: - enum - { - // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte - // chunks. - // FIXME: Maybe we can get away with less? There isn't any actual - // RAM in most of this space. - VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32, - // The number of elements in the allocated array. Each u32 contains 32 bits. - VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32 - }; - // Directly accessed by Jit64. - std::unique_ptr m_valid_block; - ValidBlockBitSet() { m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]); @@ -121,6 +108,19 @@ public: void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); } void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); } bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; } + +private: + enum + { + // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte + // chunks. + // FIXME: Maybe we can get away with less? There isn't any actual + // RAM in most of this space. + VALID_BLOCK_MASK_SIZE = (1ULL << 32) / 32, + // The number of elements in the allocated array. Each u32 contains 32 bits. + VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32 + }; + std::unique_ptr m_valid_block; }; class JitBaseBlockCache @@ -162,8 +162,6 @@ public: void InvalidateICache(u32 address, u32 length, bool forced); void ErasePhysicalRange(u32 address, u32 length); - u32* GetBlockBitSet() const; - protected: virtual void DestroyBlock(JitBlock& block);