From b84a0704cdf7dd64afa3b27439e7d029baefa648 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 25 Jul 2021 10:58:18 +0200 Subject: [PATCH 1/3] Revert "Jit: Fix correctness issue in dcbf/dcbi/dcbst" This reverts commit 66b992cfe441fedd7f21827946873bf4c0238b6f. A new (additional) correctness issue was revealed in the old AArch64 code when applying it on top of modern JitArm64: LSR was being used when LSRV was intended. This commit uses LSRV. --- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 20 +++++++++++-- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 30 ++++++++++++++++--- .../Core/Core/PowerPC/JitCommon/JitCache.cpp | 5 ++++ Source/Core/Core/PowerPC/JitCommon/JitCache.h | 26 ++++++++-------- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 6b8052a48c..a0e555770d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -234,21 +234,37 @@ void Jit64::dcbx(UGeckoInstruction inst) JITDISABLE(bJITLoadStoreOff); X64Reg addr = RSCRATCH; + X64Reg value = RSCRATCH2; RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0); RCOpArg Rb = gpr.Use(inst.RB, RCMode::Read); - RegCache::Realize(Ra, Rb); + RCX64Reg tmp = gpr.Scratch(); + RegCache::Realize(Ra, Rb, tmp); MOV_sum(32, addr, Ra, Rb); - AND(32, R(addr), Imm8(~31)); + // Check whether a JIT cache line needs to be invalidated. + LEA(32, value, MScaled(addr, SCALE_8, 0)); // addr << 3 (masks the first 3 bits) + SHR(32, R(value), Imm8(3 + 5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset + MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet())); + MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0)); + SHR(32, R(addr), Imm8(5)); + BT(32, R(value), R(addr)); + + FixupBranch c = J_CC(CC_C, true); + SwitchToFarCode(); + SetJumpTarget(c); BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); MOV(32, R(ABI_PARAM1), R(addr)); + SHL(32, R(ABI_PARAM1), Imm8(5)); MOV(32, R(ABI_PARAM2), Imm32(32)); XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); ABI_CallFunction(JitInterface::InvalidateICache); ABI_PopRegistersAndAdjustStack(registersInUse, 0); asm_routines.ResetStack(*this); + c = J(true); + SwitchToNearCode(); + SetJumpTarget(c); } void Jit64::dcbt(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index c5e3d7f6fa..0673b7ce22 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -539,9 +539,11 @@ void JitArm64::dcbx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); - gpr.Lock(ARM64Reg::W0); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr = ARM64Reg::W0; + ARM64Reg addr = gpr.GetReg(); + ARM64Reg value = gpr.GetReg(); + ARM64Reg WA = ARM64Reg::W30; u32 a = inst.RA, b = inst.RB; @@ -550,7 +552,21 @@ void JitArm64::dcbx(UGeckoInstruction inst) else MOV(addr, gpr.R(b)); - AND(addr, addr, LogicalImm(~31, 32)); // mask sizeof cacheline + // Check whether a JIT cache line needs to be invalidated. + AND(value, addr, LogicalImm(0x1ffffc00, 32)); // upper three bits and last 10 bit are masked for + // the bitset of cachelines, 0x1ffffc00 + LSR(value, value, 5 + 5); // >> 5 for cache line size, >> 5 for width of bitset + MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet()); + LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true)); + + LSR(addr, addr, 5); // mask sizeof cacheline, & 0x1f is the position within the bitset + + LSRV(value, value, addr); // move current bit to bit 0 + + FixupBranch bit_not_set = TBZ(value, 0); + FixupBranch far_addr = B(); + SwitchToFarCode(); + SetJumpTarget(far_addr); BitSet32 gprs_to_push = gpr.GetCallerSavedUsed(); BitSet32 fprs_to_push = fpr.GetCallerSavedUsed(); @@ -558,6 +574,7 @@ void JitArm64::dcbx(UGeckoInstruction inst) ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); + LSL(ARM64Reg::W0, addr, 5); MOVI2R(ARM64Reg::X1, 32); MOVI2R(ARM64Reg::X2, 0); MOVP2R(ARM64Reg::X3, &JitInterface::InvalidateICache); @@ -566,7 +583,12 @@ void JitArm64::dcbx(UGeckoInstruction inst) m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30); ABI_PopRegisters(gprs_to_push); - gpr.Unlock(ARM64Reg::W0); + FixupBranch near_addr = B(); + SwitchToNearCode(); + SetJumpTarget(bit_not_set); + SetJumpTarget(near_addr); + + gpr.Unlock(addr, value, ARM64Reg::W30); } void JitArm64::dcbt(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp index 0a6e9bf07c..898a87b0ef 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp @@ -269,6 +269,11 @@ void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length) } } +u32* JitBaseBlockCache::GetBlockBitSet() const +{ + return valid_block.m_valid_block.get(); +} + void JitBaseBlockCache::WriteDestroyBlock(const JitBlock& block) { } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index bf8de09f9b..35956e471e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -99,18 +99,6 @@ typedef void (*CompiledCode)(); class ValidBlockBitSet final { public: - ValidBlockBitSet() - { - m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]); - ClearAll(); - } - - void Set(u32 bit) { m_valid_block[bit / 32] |= 1u << (bit % 32); } - void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); } - void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); } - bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; } - -private: enum { // ValidBlockBitSet covers the whole 32-bit address-space in 32-byte @@ -121,7 +109,19 @@ private: // The number of elements in the allocated array. Each u32 contains 32 bits. VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32 }; + // Directly accessed by Jit64. std::unique_ptr m_valid_block; + + ValidBlockBitSet() + { + m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]); + ClearAll(); + } + + void Set(u32 bit) { m_valid_block[bit / 32] |= 1u << (bit % 32); } + void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); } + void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); } + bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; } }; class JitBaseBlockCache @@ -163,6 +163,8 @@ public: void InvalidateICache(u32 address, u32 length, bool forced); void ErasePhysicalRange(u32 address, u32 length); + u32* GetBlockBitSet() const; + protected: virtual void DestroyBlock(JitBlock& block); From 92d1d60ff1d0a03146cdc9cee2c289441037692d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 25 Jul 2021 13:24:30 +0200 Subject: [PATCH 2/3] Jit: Perform BAT lookup in dcbf/dcbi/dcbst When 66b992c fixed https://bugs.dolphin-emu.org/issues/12133, it did so by removing the broken address calculation entirely and always using the slow path. This caused a performance regression, https://bugs.dolphin-emu.org/issues/12477. This commit instead replaces the broken address calculation with a BAT lookup. If the BAT lookup succeeds, we can use the old fast path. Otherwise we use the slow path. Intends to improve https://bugs.dolphin-emu.org/issues/12477. --- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 31 +++++++--- .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp | 10 ++++ .../Core/PowerPC/Jit64Common/EmuCodeBlock.h | 4 ++ Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 ++ .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 58 ++++++++++++++----- 5 files changed, 84 insertions(+), 23 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index a0e555770d..93b38710b4 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -240,31 +240,46 @@ void Jit64::dcbx(UGeckoInstruction inst) RCX64Reg tmp = gpr.Scratch(); RegCache::Realize(Ra, Rb, tmp); - MOV_sum(32, addr, Ra, Rb); + // Translate effective address to physical address. + MOV_sum(32, value, Ra, Rb); + FixupBranch bat_lookup_failed; + if (MSR.IR) + { + MOV(32, R(addr), R(value)); + bat_lookup_failed = BATAddressLookup(value, tmp); + AND(32, R(addr), Imm32(0x0001ffff)); + AND(32, R(value), Imm32(0xfffe0000)); + OR(32, R(value), R(addr)); + } + MOV(32, R(addr), R(value)); // Check whether a JIT cache line needs to be invalidated. - LEA(32, value, MScaled(addr, SCALE_8, 0)); // addr << 3 (masks the first 3 bits) - SHR(32, R(value), Imm8(3 + 5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset + SHR(32, R(value), Imm8(5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet())); MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0)); SHR(32, R(addr), Imm8(5)); BT(32, R(value), R(addr)); + FixupBranch invalidate_needed = J_CC(CC_C, true); - FixupBranch c = J_CC(CC_C, true); SwitchToFarCode(); - SetJumpTarget(c); + SetJumpTarget(invalidate_needed); + SHL(32, R(addr), Imm8(5)); + if (MSR.IR) + SetJumpTarget(bat_lookup_failed); + BitSet32 registersInUse = CallerSavedRegistersInUse(); + registersInUse[X64Reg(tmp)] = false; ABI_PushRegistersAndAdjustStack(registersInUse, 0); MOV(32, R(ABI_PARAM1), R(addr)); - SHL(32, R(ABI_PARAM1), Imm8(5)); MOV(32, R(ABI_PARAM2), Imm32(32)); XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); ABI_CallFunction(JitInterface::InvalidateICache); ABI_PopRegistersAndAdjustStack(registersInUse, 0); asm_routines.ResetStack(*this); - c = J(true); + + FixupBranch done = J(true); SwitchToNearCode(); - SetJumpTarget(c); + SetJumpTarget(done); } void Jit64::dcbt(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 398602e685..626d029f30 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -91,6 +91,16 @@ void EmuCodeBlock::SwitchToNearCode() SetCodePtr(m_near_code, m_near_code_end, m_near_code_write_failed); } +FixupBranch EmuCodeBlock::BATAddressLookup(X64Reg addr, X64Reg tmp) +{ + MOV(64, R(tmp), ImmPtr(&PowerPC::dbat_table[0])); + SHR(32, R(addr), Imm8(PowerPC::BAT_INDEX_SHIFT)); + MOV(32, R(addr), MComplex(tmp, addr, SCALE_4, 0)); + BT(32, R(addr), Imm8(IntLog2(PowerPC::BAT_MAPPED_BIT))); + + return J_CC(CC_Z, m_far_code.Enabled()); +} + FixupBranch EmuCodeBlock::CheckIfSafeAddress(const OpArg& reg_value, X64Reg reg_addr, BitSet32 registers_in_use) { diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h index 916e6c906d..20cc36570d 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h @@ -49,6 +49,10 @@ public: return Gen::M(m_const_pool.GetConstant(&value, sizeof(T), N, index)); } + // Writes upper 15 bits of physical address to addr and clobbers the lower 17 bits of addr. + // Jumps to the returned FixupBranch if lookup fails. + Gen::FixupBranch BATAddressLookup(Gen::X64Reg addr, Gen::X64Reg tmp); + Gen::FixupBranch CheckIfSafeAddress(const Gen::OpArg& reg_value, Gen::X64Reg reg_addr, BitSet32 registers_in_use); void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index f4ca4c0485..b7daab4327 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -229,6 +229,10 @@ protected: // Loadstore routines void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset); + // If lookup succeeds, writes upper 15 bits of physical address to addr_out. If not, + // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out. + Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in, + Arm64Gen::ARM64Reg tmp); void DoJit(u32 em_address, JitBlock* b, u32 nextPC); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 0673b7ce22..2974aeef30 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -274,6 +274,19 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); } +FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp) +{ + tmp = EncodeRegTo64(tmp); + + MOVP2R(tmp, PowerPC::dbat_table.data()); + LSR(addr_out, addr_in, PowerPC::BAT_INDEX_SHIFT); + LDR(addr_out, tmp, ArithOption(addr_out, true)); + FixupBranch pass = TBNZ(addr_out, IntLog2(PowerPC::BAT_MAPPED_BIT)); + FixupBranch fail = B(); + SetJumpTarget(pass); + return fail; +} + void JitArm64::lXX(UGeckoInstruction inst) { INSTRUCTION_START @@ -539,46 +552,59 @@ void JitArm64::dcbx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); - gpr.Lock(ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W30); - ARM64Reg addr = gpr.GetReg(); + ARM64Reg effective_addr = ARM64Reg::W0; + ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr; ARM64Reg value = gpr.GetReg(); ARM64Reg WA = ARM64Reg::W30; u32 a = inst.RA, b = inst.RB; if (a) - ADD(addr, gpr.R(a), gpr.R(b)); + ADD(effective_addr, gpr.R(a), gpr.R(b)); else - MOV(addr, gpr.R(b)); + MOV(effective_addr, gpr.R(b)); + + // Translate effective address to physical address. + FixupBranch bat_lookup_failed; + if (MSR.IR) + { + bat_lookup_failed = BATAddressLookup(physical_addr, effective_addr, WA); + BFI(physical_addr, effective_addr, 0, PowerPC::BAT_INDEX_SHIFT); + } // Check whether a JIT cache line needs to be invalidated. - AND(value, addr, LogicalImm(0x1ffffc00, 32)); // upper three bits and last 10 bit are masked for - // the bitset of cachelines, 0x1ffffc00 - LSR(value, value, 5 + 5); // >> 5 for cache line size, >> 5 for width of bitset + LSR(value, physical_addr, 5 + 5); // >> 5 for cache line size, >> 5 for width of bitset MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet()); LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true)); - LSR(addr, addr, 5); // mask sizeof cacheline, & 0x1f is the position within the bitset + LSR(WA, physical_addr, 5); // mask sizeof cacheline, & 0x1f is the position within the bitset - LSRV(value, value, addr); // move current bit to bit 0 + LSRV(value, value, WA); // move current bit to bit 0 FixupBranch bit_not_set = TBZ(value, 0); FixupBranch far_addr = B(); SwitchToFarCode(); SetJumpTarget(far_addr); + if (MSR.IR) + SetJumpTarget(bat_lookup_failed); BitSet32 gprs_to_push = gpr.GetCallerSavedUsed(); BitSet32 fprs_to_push = fpr.GetCallerSavedUsed(); + gprs_to_push[DecodeReg(effective_addr)] = false; + gprs_to_push[DecodeReg(physical_addr)] = false; + gprs_to_push[DecodeReg(value)] = false; + gprs_to_push[DecodeReg(WA)] = false; ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); - LSL(ARM64Reg::W0, addr, 5); - MOVI2R(ARM64Reg::X1, 32); - MOVI2R(ARM64Reg::X2, 0); - MOVP2R(ARM64Reg::X3, &JitInterface::InvalidateICache); - BLR(ARM64Reg::X3); + MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICache); + // W0 was already set earlier + MOVI2R(ARM64Reg::W1, 32); + MOVI2R(ARM64Reg::W2, 0); + BLR(ARM64Reg::X8); m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30); ABI_PopRegisters(gprs_to_push); @@ -588,7 +614,9 @@ void JitArm64::dcbx(UGeckoInstruction inst) SetJumpTarget(bit_not_set); SetJumpTarget(near_addr); - gpr.Unlock(addr, value, ARM64Reg::W30); + gpr.Unlock(effective_addr, value, WA); + if (MSR.IR) + gpr.Unlock(physical_addr); } void JitArm64::dcbt(UGeckoInstruction inst) From ca55d599e802e28349793f0098b1132df86149a9 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 27 Jul 2021 11:11:30 +0200 Subject: [PATCH 3/3] Jit: Mark ValidBlockBitSet::Test as const --- Source/Core/Core/PowerPC/JitCommon/JitCache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 35956e471e..ec8e9951df 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -121,7 +121,7 @@ public: void Set(u32 bit) { m_valid_block[bit / 32] |= 1u << (bit % 32); } void Clear(u32 bit) { m_valid_block[bit / 32] &= ~(1u << (bit % 32)); } void ClearAll() { memset(m_valid_block.get(), 0, sizeof(u32) * VALID_BLOCK_ALLOC_ELEMENTS); } - bool Test(u32 bit) { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; } + bool Test(u32 bit) const { return (m_valid_block[bit / 32] & (1u << (bit % 32))) != 0; } }; class JitBaseBlockCache