diff --git a/Source/Core/Core/Boot/Boot_BS2Emu.cpp b/Source/Core/Core/Boot/Boot_BS2Emu.cpp index 5be2905096..f907b85647 100644 --- a/Source/Core/Core/Boot/Boot_BS2Emu.cpp +++ b/Source/Core/Core/Boot/Boot_BS2Emu.cpp @@ -190,6 +190,12 @@ bool CBoot::RunApploader(bool is_wii, const DiscIO::VolumeDisc& volume, INFO_LOG_FMT(BOOT, "DVDRead: offset: {:08x} memOffset: {:08x} length: {}", dvd_offset, ram_address, length); DVDRead(volume, dvd_offset, ram_address, length, partition); + for (u32 i = 0; i < length; i += 32) + { + if (PowerPC::ppcState.m_enable_dcache) + PowerPC::ppcState.dCache.Invalidate(ram_address + i); + PowerPC::ppcState.iCache.Invalidate(ram_address + i); + } DiscIO::Riivolution::ApplyApploaderMemoryPatches(riivolution_patches, ram_address, length); diff --git a/Source/Core/Core/Config/MainSettings.cpp b/Source/Core/Core/Config/MainSettings.cpp index c88849a795..a6f3a890aa 100644 --- a/Source/Core/Core/Config/MainSettings.cpp +++ b/Source/Core/Core/Config/MainSettings.cpp @@ -37,6 +37,7 @@ const Info MAIN_CPU_CORE{{System::Main, "Core", "CPUCore"}, PowerPC::DefaultCPUCore()}; const Info MAIN_JIT_FOLLOW_BRANCH{{System::Main, "Core", "JITFollowBranch"}, true}; const Info MAIN_FASTMEM{{System::Main, "Core", "Fastmem"}, true}; +const Info MAIN_ACCURATE_CPU_CACHE{{System::Main, "Core", "AccurateCPUCache"}, false}; const Info MAIN_DSP_HLE{{System::Main, "Core", "DSPHLE"}, true}; const Info MAIN_TIMING_VARIANCE{{System::Main, "Core", "TimingVariance"}, 40}; const Info MAIN_CPU_THREAD{{System::Main, "Core", "CPUThread"}, true}; diff --git a/Source/Core/Core/Config/MainSettings.h b/Source/Core/Core/Config/MainSettings.h index 0730681f2d..92b909adf5 100644 --- a/Source/Core/Core/Config/MainSettings.h +++ b/Source/Core/Core/Config/MainSettings.h @@ -55,6 +55,7 @@ extern const Info MAIN_SKIP_IPL; extern const Info MAIN_CPU_CORE; extern const Info MAIN_JIT_FOLLOW_BRANCH; extern const Info MAIN_FASTMEM; +extern const Info MAIN_ACCURATE_CPU_CACHE; // Should really be in 
the DSP section, but we're kind of stuck with bad decisions made in the past. extern const Info MAIN_DSP_HLE; extern const Info MAIN_TIMING_VARIANCE; diff --git a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp index 9c50002204..acea1f5e7f 100644 --- a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp +++ b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp @@ -127,6 +127,7 @@ bool IsSettingSaveable(const Config::Location& config_location) &Config::MAIN_CPU_THREAD.GetLocation(), &Config::MAIN_MMU.GetLocation(), &Config::MAIN_PAUSE_ON_PANIC.GetLocation(), + &Config::MAIN_ACCURATE_CPU_CACHE.GetLocation(), &Config::MAIN_BB_DUMP_PORT.GetLocation(), &Config::MAIN_SYNC_GPU.GetLocation(), &Config::MAIN_SYNC_GPU_MAX_DISTANCE.GetLocation(), diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 88f9f9997c..401e98e8f1 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -438,14 +438,17 @@ void Interpreter::dcba(UGeckoInstruction inst) void Interpreter::dcbf(UGeckoInstruction inst) { - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.) const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) 
+ JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::FlushCacheLine(address); } void Interpreter::dcbi(UGeckoInstruction inst) @@ -456,42 +459,44 @@ void Interpreter::dcbi(UGeckoInstruction inst) return; } - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.) const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) + JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::InvalidateCacheLine(address); } void Interpreter::dcbst(UGeckoInstruction inst) { - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.) const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) + JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::StoreCacheLine(address); } +// These instructions hint that it might be optimal to prefetch the specified cache line into the +// data cache. 
But the CPU is never guaranteed to do this fetch, and in practice it's not more +// performant to emulate it. + void Interpreter::dcbt(UGeckoInstruction inst) { - if (HID0.NOOPTI) - return; - - // TODO: Implement some sort of L2 emulation. } void Interpreter::dcbtst(UGeckoInstruction inst) { - if (HID0.NOOPTI) - return; - - // TODO: Implement some sort of L2 emulation. } void Interpreter::dcbz(UGeckoInstruction inst) @@ -504,14 +509,17 @@ void Interpreter::dcbz(UGeckoInstruction inst) return; } - // Hack to stop dcbz/dcbi over low MEM1 trashing memory. - if ((dcbz_addr < 0x80008000) && (dcbz_addr >= 0x80000000) && - Config::Get(Config::MAIN_LOW_DCBZ_HACK)) + if (!PowerPC::ppcState.m_enable_dcache) { - return; + // Hack to stop dcbz/dcbi over low MEM1 trashing memory. This is not needed if data cache + // emulation is enabled. + if ((dcbz_addr < 0x80008000) && (dcbz_addr >= 0x80000000) && + Config::Get(Config::MAIN_LOW_DCBZ_HACK)) + { + return; + } } - // TODO: Implement some sort of L2 emulation. PowerPC::ClearCacheLine(dcbz_addr & (~31)); } @@ -531,7 +539,6 @@ void Interpreter::dcbz_l(UGeckoInstruction inst) return; } - // FAKE: clear memory instead of clearing the cache block PowerPC::ClearCacheLine(address & (~31)); } @@ -587,6 +594,7 @@ void Interpreter::icbi(UGeckoInstruction inst) { // TODO: Raise DSI if translation fails (except for direct-store segments). 
const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); + JitInterface::InvalidateICacheLine(address); PowerPC::ppcState.iCache.Invalidate(address); } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 8699f05b75..6e01e1ecae 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -250,9 +250,32 @@ void Interpreter::mfspr(UGeckoInstruction inst) rSPR(index) &= ~1; } break; + case SPR_XER: rSPR(index) = PowerPC::GetXER().Hex; break; + + case SPR_UPMC1: + rSPR(index) = rSPR(SPR_PMC1); + break; + + case SPR_UPMC2: + rSPR(index) = rSPR(SPR_PMC2); + break; + + case SPR_UPMC3: + rSPR(index) = rSPR(SPR_PMC3); + break; + + case SPR_UPMC4: + rSPR(index) = rSPR(SPR_PMC4); + break; + + case SPR_IABR: + // A strange quirk: reading back this register on hardware always has the LSB (0x1) set to 0 + // (despite the bit appearing to function normally when set). This does not apply to the DABR. 
+ rGPR[inst.RD] = rSPR(index) & ~1; + return; } rGPR[inst.RD] = rSPR(index); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 046a0d6d94..969eb1bf16 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -229,6 +229,8 @@ void Jit64::lXXx(UGeckoInstruction inst) void Jit64::dcbx(UGeckoInstruction inst) { + FALLBACK_IF(m_accurate_cpu_cache_enabled); + INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 3629e2deec..d74c1bfbc2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -412,6 +412,11 @@ void Jit64::mfspr(UGeckoInstruction inst) case SPR_PMC2: case SPR_PMC3: case SPR_PMC4: + case SPR_UPMC1: + case SPR_UPMC2: + case SPR_UPMC3: + case SPR_UPMC4: + case SPR_IABR: FALLBACK_IF(true); default: { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 732094e3c1..386c0fb69b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -61,6 +61,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, { const u32 access_size = BackPatchInfo::GetFlagSize(flags); + if (m_accurate_cpu_cache_enabled) + mode = MemAccessMode::AlwaysSafe; + const bool emit_fastmem = mode != MemAccessMode::AlwaysSafe; const bool emit_slowmem = mode != MemAccessMode::AlwaysUnsafe; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 6642cec959..63b97fca68 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -625,6 
+625,8 @@ void JitArm64::stmw(UGeckoInstruction inst) void JitArm64::dcbx(UGeckoInstruction inst) { + FALLBACK_IF(m_accurate_cpu_cache_enabled); + INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp index cc95654b76..6a626f5aef 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp @@ -395,6 +395,15 @@ void JitArm64::mfspr(UGeckoInstruction inst) break; case SPR_WPAR: case SPR_DEC: + case SPR_PMC1: + case SPR_PMC2: + case SPR_PMC3: + case SPR_PMC4: + case SPR_UPMC1: + case SPR_UPMC2: + case SPR_UPMC3: + case SPR_UPMC4: + case SPR_IABR: FALLBACK_IF(true); default: gpr.BindToRegister(d, false); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp index 679fdea0a8..27bdf3bb13 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp @@ -58,6 +58,13 @@ void JitBase::RefreshConfig() m_fastmem_enabled = Config::Get(Config::MAIN_FASTMEM); m_mmu_enabled = Core::System::GetInstance().IsMMUMode(); m_pause_on_panic_enabled = Core::System::GetInstance().IsPauseOnPanicMode(); + m_accurate_cpu_cache_enabled = Config::Get(Config::MAIN_ACCURATE_CPU_CACHE); + if (m_accurate_cpu_cache_enabled) + { + m_fastmem_enabled = false; + // This hack is unneeded if the data cache is being emulated. 
+ m_low_dcbz_hack = false; + } analyzer.SetDebuggingEnabled(m_enable_debugging); analyzer.SetBranchFollowingEnabled(Config::Get(Config::MAIN_JIT_FOLLOW_BRANCH)); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 99c4d67485..ad218ed8a3 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -136,6 +136,7 @@ protected: bool m_fastmem_enabled = false; bool m_mmu_enabled = false; bool m_pause_on_panic_enabled = false; + bool m_accurate_cpu_cache_enabled = false; void RefreshConfig(); diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 4e44f8c5bf..1ea8397f96 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -187,6 +187,8 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) return static_cast(var); } + bool wi = false; + if (!never_translate && MSR.DR) { auto translated_addr = TranslateAddress(em_address); @@ -197,6 +199,7 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) return 0; } em_address = translated_addr.address; + wi = translated_addr.wi; } if (flag == XCheckTLBFlag::Read && (em_address & 0xF8000000) == 0x08000000) @@ -221,7 +224,18 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) // Handle RAM; the masking intentionally discards bits (essentially creating // mirrors of memory). 
T value; - std::memcpy(&value, &memory.GetRAM()[em_address & memory.GetRamMask()], sizeof(T)); + em_address &= memory.GetRamMask(); + + if (!ppcState.m_enable_dcache || wi) + { + std::memcpy(&value, &memory.GetRAM()[em_address], sizeof(T)); + } + else + { + ppcState.dCache.Read(em_address, &value, sizeof(T), + HID0.DLOCK || flag != XCheckTLBFlag::Read); + } + return bswap(value); } @@ -229,7 +243,18 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) (em_address & 0x0FFFFFFF) < memory.GetExRamSizeReal()) { T value; - std::memcpy(&value, &memory.GetEXRAM()[em_address & 0x0FFFFFFF], sizeof(T)); + em_address &= 0x0FFFFFFF; + + if (!ppcState.m_enable_dcache || wi) + { + std::memcpy(&value, &memory.GetEXRAM()[em_address], sizeof(T)); + } + else + { + ppcState.dCache.Read(em_address + 0x10000000, &value, sizeof(T), + HID0.DLOCK || flag != XCheckTLBFlag::Read); + } + return bswap(value); } @@ -391,14 +416,28 @@ static void WriteToHardware(Memory::MemoryManager& memory, u32 em_address, const { // Handle RAM; the masking intentionally discards bits (essentially creating // mirrors of memory). 
- std::memcpy(&memory.GetRAM()[em_address & memory.GetRamMask()], &swapped_data, size); + em_address &= memory.GetRamMask(); + + if (ppcState.m_enable_dcache && !wi) + ppcState.dCache.Write(em_address, &swapped_data, size, HID0.DLOCK); + + if (!ppcState.m_enable_dcache || wi || flag != XCheckTLBFlag::Write) + std::memcpy(&memory.GetRAM()[em_address], &swapped_data, size); + return; } if (memory.GetEXRAM() && (em_address >> 28) == 0x1 && (em_address & 0x0FFFFFFF) < memory.GetExRamSizeReal()) { - std::memcpy(&memory.GetEXRAM()[em_address & 0x0FFFFFFF], &swapped_data, size); + em_address &= 0x0FFFFFFF; + + if (ppcState.m_enable_dcache && !wi) + ppcState.dCache.Write(em_address + 0x10000000, &swapped_data, size, HID0.DLOCK); + + if (!ppcState.m_enable_dcache || wi || flag != XCheckTLBFlag::Write) + std::memcpy(&memory.GetEXRAM()[em_address], &swapped_data, size); + return; } @@ -1129,6 +1168,100 @@ void ClearCacheLine(u32 address) WriteToHardware(memory, address + i, 0, 4); } +void StoreCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI. 
+ GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Store(address); +} + +void InvalidateCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Invalidate(address); +} + +void FlushCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI. + GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Flush(address); +} + +void TouchCacheLine(u32 address, bool store) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI. 
+ GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Touch(address, store); +} + u32 IsOptimizableMMIOAccess(u32 address, u32 access_size) { if (PowerPC::memchecks.HasAny()) diff --git a/Source/Core/Core/PowerPC/MMU.h b/Source/Core/Core/PowerPC/MMU.h index 6eda9a22b7..bfb855478d 100644 --- a/Source/Core/Core/PowerPC/MMU.h +++ b/Source/Core/Core/PowerPC/MMU.h @@ -164,7 +164,12 @@ void Write_F64(double var, u32 address); void DMA_LCToMemory(u32 mem_address, u32 cache_address, u32 num_blocks); void DMA_MemoryToLC(u32 cache_address, u32 mem_address, u32 num_blocks); + void ClearCacheLine(u32 address); // Zeroes 32 bytes; address should be 32-byte-aligned +void StoreCacheLine(u32 address); +void InvalidateCacheLine(u32 address); +void FlushCacheLine(u32 address); +void TouchCacheLine(u32 address, bool store); // TLB functions void SDRUpdated(); diff --git a/Source/Core/Core/PowerPC/PPCCache.cpp b/Source/Core/Core/PowerPC/PPCCache.cpp index 89b85a60db..acdcfaf0c5 100644 --- a/Source/Core/Core/PowerPC/PPCCache.cpp +++ b/Source/Core/Core/PowerPC/PPCCache.cpp @@ -94,134 +94,270 @@ InstructionCache::~InstructionCache() Config::RemoveConfigChangedCallback(*m_config_callback_id); } -void InstructionCache::Reset() +void Cache::Reset() { valid.fill(0); plru.fill(0); + wrote.fill(0); lookup_table.fill(0xFF); lookup_table_ex.fill(0xFF); lookup_table_vmem.fill(0xFF); +} + +void InstructionCache::Reset() +{ + Cache::Reset(); JitInterface::ClearSafe(); } +void Cache::Init() +{ + data.fill({}); + tags.fill({}); + addrs.fill({}); + Reset(); +} + void InstructionCache::Init() { if (!m_config_callback_id) m_config_callback_id = Config::AddConfigChangedCallback([this] { RefreshConfig(); }); RefreshConfig(); - data.fill({}); - tags.fill({}); - Reset(); + Cache::Init(); } -void InstructionCache::Invalidate(u32 addr) -{ - if (!HID0.ICE || m_disable_icache) - return; - - // Invalidates the whole 
set - const u32 set = (addr >> 5) & 0x7f; - for (size_t i = 0; i < 8; i++) - { - if (valid[set] & (1U << i)) - { - if (tags[set][i] & (ICACHE_VMEM_BIT >> 12)) - lookup_table_vmem[((tags[set][i] << 7) | set) & 0xfffff] = 0xff; - else if (tags[set][i] & (ICACHE_EXRAM_BIT >> 12)) - lookup_table_ex[((tags[set][i] << 7) | set) & 0x1fffff] = 0xff; - else - lookup_table[((tags[set][i] << 7) | set) & 0xfffff] = 0xff; - } - } - valid[set] = 0; - JitInterface::InvalidateICacheLine(addr); -} - -u32 InstructionCache::ReadInstruction(u32 addr) +void Cache::Store(u32 addr) { auto& system = Core::System::GetInstance(); auto& memory = system.GetMemory(); - if (!HID0.ICE || m_disable_icache) // instruction cache is disabled - return memory.Read_U32(addr); - u32 set = (addr >> 5) & 0x7f; - u32 tag = addr >> 12; + auto [set, way] = GetCache(addr, true); - u32 t; - if (addr & ICACHE_VMEM_BIT) + if (way == 0xff) + return; + + if (valid[set] & (1U << way) && wrote[set] & (1U << way)) + memory.CopyToEmu((addr & ~0x1f), reinterpret_cast(data[set][way].data()), 32); + wrote[set] &= ~(1U << way); +} + +void Cache::FlushAll() +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + for (size_t set = 0; set < CACHE_SETS; set++) { - t = lookup_table_vmem[(addr >> 5) & 0xfffff]; + for (size_t way = 0; way < CACHE_WAYS; way++) + { + if (valid[set] & (1U << way) && wrote[set] & (1U << way)) + memory.CopyToEmu(addrs[set][way], reinterpret_cast(data[set][way].data()), 32); + } } - else if (addr & ICACHE_EXRAM_BIT) + + Reset(); +} + +void Cache::Invalidate(u32 addr) +{ + auto [set, way] = GetCache(addr, true); + + if (way == 0xff) + return; + + if (valid[set] & (1U << way)) { - t = lookup_table_ex[(addr >> 5) & 0x1fffff]; + if (tags[set][way] & (CACHE_VMEM_BIT >> 12)) + lookup_table_vmem[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; + else if (tags[set][way] & (CACHE_EXRAM_BIT >> 12)) + lookup_table_ex[((tags[set][way] << 7) | set) & 0x1fffff] = 0xff; + else + 
lookup_table[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; + + valid[set] &= ~(1U << way); + wrote[set] &= ~(1U << way); + } +} + +void Cache::Flush(u32 addr) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto [set, way] = GetCache(addr, true); + + if (way == 0xff) + return; + + if (valid[set] & (1U << way)) + { + if (wrote[set] & (1U << way)) + memory.CopyToEmu((addr & ~0x1f), reinterpret_cast(data[set][way].data()), 32); + + if (tags[set][way] & (CACHE_VMEM_BIT >> 12)) + lookup_table_vmem[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; + else if (tags[set][way] & (CACHE_EXRAM_BIT >> 12)) + lookup_table_ex[((tags[set][way] << 7) | set) & 0x1fffff] = 0xff; + else + lookup_table[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; + + valid[set] &= ~(1U << way); + wrote[set] &= ~(1U << way); + } +} + +void Cache::Touch(u32 addr, bool store) +{ + GetCache(addr, false); +} + +std::pair Cache::GetCache(u32 addr, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + addr &= ~31; + u32 set = (addr >> 5) & 0x7f; + u32 way; + + if (addr & CACHE_VMEM_BIT) + { + way = lookup_table_vmem[(addr >> 5) & 0xfffff]; + } + else if (addr & CACHE_EXRAM_BIT) + { + way = lookup_table_ex[(addr >> 5) & 0x1fffff]; } else { - t = lookup_table[(addr >> 5) & 0xfffff]; + way = lookup_table[(addr >> 5) & 0xfffff]; } - if (t == 0xff) // load to the cache + // load to the cache + if (!locked && way == 0xff) { - if (HID0.ILOCK) // instruction cache is locked - return memory.Read_U32(addr); + u32 tag = addr >> 12; + // select a way if (valid[set] != 0xff) - t = s_way_from_valid[valid[set]]; + way = s_way_from_valid[valid[set]]; else - t = s_way_from_plru[plru[set]]; - // load - memory.CopyFromEmu(reinterpret_cast(data[set][t].data()), (addr & ~0x1f), 32); - if (valid[set] & (1 << t)) + way = s_way_from_plru[plru[set]]; + + if (valid[set] & (1 << way)) { - if (tags[set][t] & (ICACHE_VMEM_BIT >> 12)) - 
lookup_table_vmem[((tags[set][t] << 7) | set) & 0xfffff] = 0xff; - else if (tags[set][t] & (ICACHE_EXRAM_BIT >> 12)) - lookup_table_ex[((tags[set][t] << 7) | set) & 0x1fffff] = 0xff; + // store the cache back to main memory + if (wrote[set] & (1 << way)) + memory.CopyToEmu(addrs[set][way], reinterpret_cast(data[set][way].data()), 32); + + if (tags[set][way] & (CACHE_VMEM_BIT >> 12)) + lookup_table_vmem[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; + else if (tags[set][way] & (CACHE_EXRAM_BIT >> 12)) + lookup_table_ex[((tags[set][way] << 7) | set) & 0x1fffff] = 0xff; else - lookup_table[((tags[set][t] << 7) | set) & 0xfffff] = 0xff; + lookup_table[((tags[set][way] << 7) | set) & 0xfffff] = 0xff; } - if (addr & ICACHE_VMEM_BIT) - lookup_table_vmem[(addr >> 5) & 0xfffff] = t; - else if (addr & ICACHE_EXRAM_BIT) - lookup_table_ex[(addr >> 5) & 0x1fffff] = t; + // load + memory.CopyFromEmu(reinterpret_cast(data[set][way].data()), (addr & ~0x1f), 32); + + if (addr & CACHE_VMEM_BIT) + lookup_table_vmem[(addr >> 5) & 0xfffff] = way; + else if (addr & CACHE_EXRAM_BIT) + lookup_table_ex[(addr >> 5) & 0x1fffff] = way; else - lookup_table[(addr >> 5) & 0xfffff] = t; - tags[set][t] = tag; - valid[set] |= (1 << t); + lookup_table[(addr >> 5) & 0xfffff] = way; + tags[set][way] = tag; + addrs[set][way] = addr; + valid[set] |= (1 << way); + wrote[set] &= ~(1 << way); } + // update plru - plru[set] = (plru[set] & ~s_plru_mask[t]) | s_plru_value[t]; - const u32 res = Common::swap32(data[set][t][(addr >> 2) & 7]); - const u32 inmem = memory.Read_U32(addr); - if (res != inmem) - { - INFO_LOG_FMT(POWERPC, - "ICache read at {:08x} returned stale data: CACHED: {:08x} vs. 
RAM: {:08x}", addr, - res, inmem); - DolphinAnalytics::Instance().ReportGameQuirk(GameQuirk::ICACHE_MATTERS); - } - return res; + if (way != 0xff) + plru[set] = (plru[set] & ~s_plru_mask[way]) | s_plru_value[way]; + + return {set, way}; } -void InstructionCache::DoState(PointerWrap& p) +void Cache::Read(u32 addr, void* buffer, u32 len, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto* value = static_cast(buffer); + + while (len > 0) + { + auto [set, way] = GetCache(addr, locked); + + u32 offset_in_block = addr - (addr & ~31); + u32 len_in_block = std::min(len, ((addr + 32) & ~31) - addr); + + if (way != 0xff) + { + std::memcpy(value, reinterpret_cast(data[set][way].data()) + offset_in_block, + len_in_block); + } + else + { + memory.CopyFromEmu(value, addr, len_in_block); + } + + addr += len_in_block; + len -= len_in_block; + value += len_in_block; + } +} + +void Cache::Write(u32 addr, const void* buffer, u32 len, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto* value = static_cast(buffer); + + while (len > 0) + { + auto [set, way] = GetCache(addr, locked); + + u32 offset_in_block = addr - (addr & ~31); + u32 len_in_block = std::min(len, ((addr + 32) & ~31) - addr); + + if (way != 0xff) + { + std::memcpy(reinterpret_cast(data[set][way].data()) + offset_in_block, value, + len_in_block); + wrote[set] |= (1 << way); + } + else + { + memory.CopyToEmu(addr, value, len_in_block); + } + + addr += len_in_block; + len -= len_in_block; + value += len_in_block; + } +} + +void Cache::DoState(PointerWrap& p) { if (p.IsReadMode()) { // Clear valid parts of the lookup tables (this is done instead of using fill(0xff) to avoid // loading the entire 4MB of tables into cache) - for (u32 set = 0; set < ICACHE_SETS; set++) + for (u32 set = 0; set < CACHE_SETS; set++) { - for (u32 way = 0; way < ICACHE_WAYS; way++) + for (u32 way = 0; way < CACHE_WAYS; way++) { if 
((valid[set] & (1 << way)) != 0) { const u32 addr = (tags[set][way] << 12) | (set << 5); - if (addr & ICACHE_VMEM_BIT) + if (addr & CACHE_VMEM_BIT) lookup_table_vmem[(addr >> 5) & 0xfffff] = 0xff; - else if (addr & ICACHE_EXRAM_BIT) + else if (addr & CACHE_EXRAM_BIT) lookup_table_ex[(addr >> 5) & 0x1fffff] = 0xff; else lookup_table[(addr >> 5) & 0xfffff] = 0xff; @@ -234,20 +370,22 @@ void InstructionCache::DoState(PointerWrap& p) p.DoArray(tags); p.DoArray(plru); p.DoArray(valid); + p.DoArray(addrs); + p.DoArray(wrote); if (p.IsReadMode()) { // Recompute lookup tables - for (u32 set = 0; set < ICACHE_SETS; set++) + for (u32 set = 0; set < CACHE_SETS; set++) { - for (u32 way = 0; way < ICACHE_WAYS; way++) + for (u32 way = 0; way < CACHE_WAYS; way++) { if ((valid[set] & (1 << way)) != 0) { const u32 addr = (tags[set][way] << 12) | (set << 5); - if (addr & ICACHE_VMEM_BIT) + if (addr & CACHE_VMEM_BIT) lookup_table_vmem[(addr >> 5) & 0xfffff] = way; - else if (addr & ICACHE_EXRAM_BIT) + else if (addr & CACHE_EXRAM_BIT) lookup_table_ex[(addr >> 5) & 0x1fffff] = way; else lookup_table[(addr >> 5) & 0xfffff] = way; @@ -257,6 +395,29 @@ void InstructionCache::DoState(PointerWrap& p) } } +u32 InstructionCache::ReadInstruction(u32 addr) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + if (!HID0.ICE || m_disable_icache) // instruction cache is disabled + return memory.Read_U32(addr); + + u32 value; + Read(addr, &value, sizeof(value), HID0.ILOCK); + return Common::swap32(value); +} + +void InstructionCache::Invalidate(u32 addr) +{ + if (!HID0.ICE || m_disable_icache) + return; + + Cache::Invalidate(addr); + + JitInterface::InvalidateICacheLine(addr); +} + void InstructionCache::RefreshConfig() { m_disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE); diff --git a/Source/Core/Core/PowerPC/PPCCache.h b/Source/Core/Core/PowerPC/PPCCache.h index 4b9906ea42..8aa6c4c811 100644 --- a/Source/Core/Core/PowerPC/PPCCache.h +++ 
b/Source/Core/Core/PowerPC/PPCCache.h @@ -12,20 +12,22 @@ class PointerWrap; namespace PowerPC { -constexpr u32 ICACHE_SETS = 128; -constexpr u32 ICACHE_WAYS = 8; +constexpr u32 CACHE_SETS = 128; +constexpr u32 CACHE_WAYS = 8; // size of an instruction cache block in words -constexpr u32 ICACHE_BLOCK_SIZE = 8; +constexpr u32 CACHE_BLOCK_SIZE = 8; -constexpr u32 ICACHE_EXRAM_BIT = 0x10000000; -constexpr u32 ICACHE_VMEM_BIT = 0x20000000; +constexpr u32 CACHE_EXRAM_BIT = 0x10000000; +constexpr u32 CACHE_VMEM_BIT = 0x20000000; -struct InstructionCache +struct Cache { - std::array, ICACHE_WAYS>, ICACHE_SETS> data{}; - std::array, ICACHE_SETS> tags{}; - std::array plru{}; - std::array valid{}; + std::array, CACHE_WAYS>, CACHE_SETS> data{}; + std::array, CACHE_SETS> tags{}; + std::array plru{}; + std::array valid{}; + std::array, CACHE_SETS> addrs{}; + std::array wrote{}; // Note: This is only for performance purposes; this same data could be computed at runtime // from the tags and valid fields (and that's how it's done on the actual cache) @@ -33,16 +35,36 @@ struct InstructionCache std::array lookup_table_ex{}; std::array lookup_table_vmem{}; - bool m_disable_icache = false; + void Store(u32 addr); + void Invalidate(u32 addr); + void Flush(u32 addr); + void Touch(u32 addr, bool store); + + void FlushAll(); + + std::pair GetCache(u32 addr, bool locked); + + void Read(u32 addr, void* buffer, u32 len, bool locked); + void Write(u32 addr, const void* buffer, u32 len, bool locked); + + void Init(); + void Reset(); + + void DoState(PointerWrap& p); +}; + +struct InstructionCache : public Cache +{ std::optional m_config_callback_id = std::nullopt; + bool m_disable_icache = false; + InstructionCache() = default; ~InstructionCache(); u32 ReadInstruction(u32 addr); void Invalidate(u32 addr); void Init(); void Reset(); - void DoState(PointerWrap& p); void RefreshConfig(); }; } // namespace PowerPC diff --git a/Source/Core/Core/PowerPC/PowerPC.cpp 
b/Source/Core/Core/PowerPC/PowerPC.cpp index eafa547e92..f05ed37065 100644 --- a/Source/Core/Core/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/PowerPC/PowerPC.cpp @@ -132,9 +132,16 @@ void DoState(PointerWrap& p) p.Do(ppcState.reserve_address); ppcState.iCache.DoState(p); + ppcState.dCache.DoState(p); if (p.IsReadMode()) { + if (!ppcState.m_enable_dcache) + { + INFO_LOG_FMT(POWERPC, "Flushing data cache"); + ppcState.dCache.FlushAll(); + } + RoundingModeUpdated(); IBATUpdated(); DBATUpdated(); @@ -266,6 +273,16 @@ void Init(CPUCore cpu_core) InitializeCPUCore(cpu_core); ppcState.iCache.Init(); + ppcState.dCache.Init(); + + if (Config::Get(Config::MAIN_ACCURATE_CPU_CACHE)) + { + ppcState.m_enable_dcache = true; + } + else + { + ppcState.m_enable_dcache = false; + } if (Config::Get(Config::MAIN_ENABLE_DEBUGGING)) breakpoints.ClearAllTemporary(); @@ -279,6 +296,7 @@ void Reset() ResetRegisters(); ppcState.iCache.Reset(); + ppcState.dCache.Reset(); } void ScheduleInvalidateCacheThreadSafe(u32 address) diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index df60432fce..4d70ab7439 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -172,6 +172,8 @@ struct PowerPCState u32 pagetable_hashmask = 0; InstructionCache iCache; + bool m_enable_dcache = false; + Cache dCache; // Reservation monitor for lwarx and its friend stwcxd. bool reserve; diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp index f5e494e6d1..d0cb1d0d3b 100644 --- a/Source/Core/Core/State.cpp +++ b/Source/Core/Core/State.cpp @@ -95,7 +95,7 @@ static size_t s_state_writes_in_queue; static std::condition_variable s_state_write_queue_is_empty; // Don't forget to increase this after doing changes on the savestate system -constexpr u32 STATE_VERSION = 156; // Last changed in PR 11184 +constexpr u32 STATE_VERSION = 157; // Last changed in PR 11183 // Maps savestate versions to Dolphin versions. 
// Versions after 42 don't need to be added to this list, @@ -223,14 +223,18 @@ static void DoState(PointerWrap& p) g_video_backend->DoState(p); p.DoMarker("video_backend"); - PowerPC::DoState(p); - p.DoMarker("PowerPC"); // CoreTiming needs to be restored before restoring Hardware because // the controller code might need to schedule an event if the controller has changed. system.GetCoreTiming().DoState(p); p.DoMarker("CoreTiming"); + + // HW needs to be restored before PowerPC because the data cache might need to be flushed. HW::DoState(p); p.DoMarker("HW"); + + PowerPC::DoState(p); + p.DoMarker("PowerPC"); + if (SConfig::GetInstance().bWii) Wiimote::DoState(p); p.DoMarker("Wiimote"); diff --git a/Source/Core/DolphinQt/Settings/AdvancedPane.cpp b/Source/Core/DolphinQt/Settings/AdvancedPane.cpp index f08f9d1570..62e234219d 100644 --- a/Source/Core/DolphinQt/Settings/AdvancedPane.cpp +++ b/Source/Core/DolphinQt/Settings/AdvancedPane.cpp @@ -74,6 +74,12 @@ void AdvancedPane::CreateLayout() "affect performance.\nThe performance impact is the same as having Enable MMU on.")); cpu_options_group_layout->addWidget(m_pause_on_panic_checkbox); + m_accurate_cpu_cache_checkbox = new QCheckBox(tr("Enable Write-Back Cache (slow)")); + m_accurate_cpu_cache_checkbox->setToolTip( + tr("Enables emulation of the CPU write-back cache.\nEnabling will have a significant impact " + "on performance.\nThis should be left disabled unless absolutely needed.")); + cpu_options_group_layout->addWidget(m_accurate_cpu_cache_checkbox); + auto* clock_override = new QGroupBox(tr("Clock Override")); auto* clock_override_layout = new QVBoxLayout(); clock_override->setLayout(clock_override_layout); @@ -189,6 +195,9 @@ void AdvancedPane::ConnectLayout() connect(m_pause_on_panic_checkbox, &QCheckBox::toggled, this, [](bool checked) { Config::SetBaseOrCurrent(Config::MAIN_PAUSE_ON_PANIC, checked); }); + connect(m_accurate_cpu_cache_checkbox, &QCheckBox::toggled, this, + [](bool checked) { 
Config::SetBaseOrCurrent(Config::MAIN_ACCURATE_CPU_CACHE, checked); }); + m_cpu_clock_override_checkbox->setChecked(Config::Get(Config::MAIN_OVERCLOCK_ENABLE)); connect(m_cpu_clock_override_checkbox, &QCheckBox::toggled, [this](bool enable_clock_override) { Config::SetBaseOrCurrent(Config::MAIN_OVERCLOCK_ENABLE, enable_clock_override); @@ -258,6 +267,9 @@ void AdvancedPane::Update() m_pause_on_panic_checkbox->setChecked(Config::Get(Config::MAIN_PAUSE_ON_PANIC)); m_pause_on_panic_checkbox->setEnabled(!running); + m_accurate_cpu_cache_checkbox->setChecked(Config::Get(Config::MAIN_ACCURATE_CPU_CACHE)); + m_accurate_cpu_cache_checkbox->setEnabled(!running); + QFont bf = font(); bf.setBold(Config::GetActiveLayerForConfig(Config::MAIN_OVERCLOCK_ENABLE) != Config::LayerType::Base); diff --git a/Source/Core/DolphinQt/Settings/AdvancedPane.h b/Source/Core/DolphinQt/Settings/AdvancedPane.h index c74aeacf09..b4fdb141cd 100644 --- a/Source/Core/DolphinQt/Settings/AdvancedPane.h +++ b/Source/Core/DolphinQt/Settings/AdvancedPane.h @@ -33,6 +33,7 @@ private: QComboBox* m_cpu_emulation_engine_combobox; QCheckBox* m_enable_mmu_checkbox; QCheckBox* m_pause_on_panic_checkbox; + QCheckBox* m_accurate_cpu_cache_checkbox; QCheckBox* m_cpu_clock_override_checkbox; QSlider* m_cpu_clock_override_slider; QLabel* m_cpu_clock_override_slider_label;