From 5e45330703c72e7595d177c187e5e1517b42c709 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 21:07:09 +1000 Subject: [PATCH] WIP fastmem --- src/common/page_fault_handler.cpp | 1 + src/common/page_fault_handler.h | 3 +- src/core/bus.cpp | 214 +++++++++- src/core/bus.h | 58 ++- src/core/cpu_code_cache.cpp | 194 ++++++++- src/core/cpu_code_cache.h | 16 +- src/core/cpu_core.cpp | 6 + src/core/cpu_core.h | 2 + src/core/cpu_recompiler_code_generator.cpp | 23 +- src/core/cpu_recompiler_code_generator.h | 11 +- .../cpu_recompiler_code_generator_aarch64.cpp | 283 ++++++++++--- .../cpu_recompiler_code_generator_x64.cpp | 397 +++++++++++++++--- src/core/cpu_recompiler_thunks.h | 1 + src/core/cpu_recompiler_types.h | 10 + src/core/host_interface.cpp | 12 +- src/core/settings.cpp | 2 + src/core/settings.h | 6 + src/core/system.cpp | 8 +- src/core/types.h | 2 +- 19 files changed, 1070 insertions(+), 179 deletions(-) diff --git a/src/common/page_fault_handler.cpp b/src/common/page_fault_handler.cpp index 67d3192b2..448783475 100644 --- a/src/common/page_fault_handler.cpp +++ b/src/common/page_fault_handler.cpp @@ -3,6 +3,7 @@ #include #include #include +#include Log_SetChannel(Common::PageFaultHandler); #if defined(WIN32) diff --git a/src/common/page_fault_handler.h b/src/common/page_fault_handler.h index b2c4f9040..67ef38cbd 100644 --- a/src/common/page_fault_handler.h +++ b/src/common/page_fault_handler.h @@ -1,6 +1,5 @@ #pragma once #include "types.h" -#include namespace Common::PageFaultHandler { enum class HandlerResult @@ -9,7 +8,7 @@ enum class HandlerResult ExecuteNextHandler, }; -using Callback = std::function; +using Callback = HandlerResult(*)(void* exception_pc, void* fault_address, bool is_write); using Handle = void*; bool InstallHandler(void* owner, Callback callback); diff --git a/src/core/bus.cpp b/src/core/bus.cpp index f238e6b17..340bca2c5 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -10,6 +10,7 @@ #include "cpu_disasm.h" #include "dma.h" #include "gpu.h" +#include "host_interface.h" #include "interrupt_controller.h" #include "mdec.h" #include "pad.h" @@ -22,11 +23,6 @@ Log_SetChannel(Bus); namespace Bus { -enum : TickCount -{ - RAM_READ_TICKS = 4 -}; - union MEMDELAY { u32 bits; @@ -74,8 +70,8 @@ union MEMCTRL }; std::bitset m_ram_code_bits{}; -u8 g_ram[RAM_SIZE]{}; // 2MB RAM -u8 g_bios[BIOS_SIZE]{}; // 512K BIOS ROM +u8* g_ram = nullptr; // 2MB RAM +u8* g_bios = nullptr; // 512K BIOS ROM static std::array m_exp1_access_time = {}; static std::array m_exp2_access_time = {}; @@ -90,9 +86,15 @@ static u32 m_ram_size_reg = 0; static std::string m_tty_line_buffer; +static Common::MemoryArena m_memory_arena; +static u8* m_fastmem_base = nullptr; +static std::vector m_fastmem_ram_views; + static std::tuple CalculateMemoryTiming(MEMDELAY mem_delay, COMDELAY common_delay); static void RecalculateMemoryTimings(); +static void SetCodePageFastmemProtection(u32 page_index, bool writable); + #define FIXUP_WORD_READ_OFFSET(offset) ((offset) & ~u32(3)) #define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset)&u32(3)) * 8u)) #define FIXUP_HALFWORD_READ_OFFSET(offset) ((offset) & ~u32(1)) @@ -108,19 +110,32 @@ ALWAYS_INLINE static void FixupUnalignedWordAccessW32(u32& offset, u32& value) value <<= byte_offset * 8; } -void Initialize() +bool Initialize() { + if (!AllocateMemory()) + { + g_host_interface->ReportError("Failed to allocate memory"); + return false; + } + Reset(); + return true; } void Shutdown() { - // + m_fastmem_ram_views.clear(); + if (g_ram) + m_memory_arena.ReleaseViewPtr(g_ram, RAM_SIZE); + if (g_bios) + m_memory_arena.ReleaseViewPtr(g_bios, BIOS_SIZE); + + CPU::g_state.fastmem_base = nullptr; } void Reset() { - std::memset(g_ram, 0, sizeof(g_ram)); + std::memset(g_ram, 0, RAM_SIZE); m_MEMCTRL.exp1_base = 0x1F000000; m_MEMCTRL.exp2_base = 0x1F802000; m_MEMCTRL.exp1_delay_size.bits = 0x0013243F; @@ -142,8 +157,8 @@ bool DoState(StateWrapper& sw) sw.Do(&m_bios_access_time); sw.Do(&m_cdrom_access_time); sw.Do(&m_spu_access_time); - sw.DoBytes(g_ram, sizeof(g_ram)); - sw.DoBytes(g_bios, sizeof(g_bios)); + sw.DoBytes(g_ram, RAM_SIZE); + sw.DoBytes(g_bios, BIOS_SIZE); sw.DoArray(m_MEMCTRL.regs, countof(m_MEMCTRL.regs)); sw.Do(&m_ram_size_reg); sw.Do(&m_tty_line_buffer); @@ -222,6 +237,181 @@ void RecalculateMemoryTimings() m_spu_access_time[2] + 1); } +bool AllocateMemory() +{ + if (!m_memory_arena.Create(MEMORY_ARENA_SIZE, true, false)) + { + Log_ErrorPrint("Failed to create memory arena"); + return false; + } + + // Create the base views. + g_ram = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false)); + g_bios = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, true, false)); + if (!g_ram || !g_bios) + { + Log_ErrorPrint("Failed to create base views of memory"); + return false; + } + + return true; +} + +void UpdateFastmemViews(bool enabled, bool isolate_cache) +{ + m_fastmem_ram_views.clear(); + if (!enabled) + { + m_fastmem_base = nullptr; + return; + } + + Log_DevPrintf("Remapping fastmem area, isolate cache = %s", isolate_cache ? "true " : "false"); + if (!m_fastmem_base) + { + m_fastmem_base = static_cast(m_memory_arena.FindBaseAddressForMapping(FASTMEM_REGION_SIZE)); + if (!m_fastmem_base) + { + Log_ErrorPrint("Failed to find base address for fastmem"); + return; + } + + Log_InfoPrintf("Fastmem base: %p", m_fastmem_base); + CPU::g_state.fastmem_base = m_fastmem_base; + } + + auto MapRAM = [](u32 base_address) { + u8* map_address = m_fastmem_base + base_address; + auto view = m_memory_arena.CreateView(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false, map_address); + if (!view) + { + Log_ErrorPrintf("Failed to map RAM at fastmem area %p (offset 0x%08X)", map_address, RAM_SIZE); + return; + } + + // mark all pages with code as non-writable + for (u32 i = 0; i < CPU_CODE_CACHE_PAGE_COUNT; i++) + { + if (m_ram_code_bits[i]) + { + u8* page_address = map_address + (i * CPU_CODE_CACHE_PAGE_SIZE); + if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, false, false)) + { + Log_ErrorPrintf("Failed to write-protect code page at %p"); + return; + } + } + } + + m_fastmem_ram_views.push_back(std::move(view.value())); + }; + auto MapBIOS = [](u32 base_address) { + u8* map_address = m_fastmem_base + base_address; + auto view = m_memory_arena.CreateView(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, false, false, map_address); + if (!view) + { + Log_ErrorPrintf("Failed to map BIOS at fastmem area %p (offset 0x%08X)", map_address, RAM_SIZE); + return; + } + + m_fastmem_ram_views.push_back(std::move(view.value())); + }; + + if (!isolate_cache) + { + // KUSEG - cached + MapRAM(0x00000000); + // MapBIOS(0x1FC00000); + + // KSEG0 - cached + MapRAM(0x80000000); + // MapBIOS(0x9FC00000); + } + + // KSEG1 - uncached + MapRAM(0xA0000000); + // MapBIOS(0xBFC00000); +} + +bool IsRAMCodePage(u32 index) +{ + return m_ram_code_bits[index]; +} + +void SetRAMCodePage(u32 index) +{ + if (m_ram_code_bits[index]) + return; + + // protect fastmem pages + m_ram_code_bits[index] = true; + SetCodePageFastmemProtection(index, false); +} + +void ClearRAMCodePage(u32 index) +{ + if (!m_ram_code_bits[index]) + return; + + // unprotect fastmem pages + m_ram_code_bits[index] = false; + SetCodePageFastmemProtection(index, true); +} + +void SetCodePageFastmemProtection(u32 page_index, bool writable) +{ + // unprotect fastmem pages + for (const auto& view : m_fastmem_ram_views) + { + u8* page_address = static_cast(view.GetBasePointer()) + (page_index * CPU_CODE_CACHE_PAGE_SIZE); + if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, writable, false)) + { + Log_ErrorPrintf("Failed to %s code page %u (0x%08X) @ %p", writable ? "unprotect" : "protect", page_index, + page_index * CPU_CODE_CACHE_PAGE_SIZE, page_address); + } + } +} + +void ClearRAMCodePageFlags() +{ + m_ram_code_bits.reset(); + + // unprotect fastmem pages + for (const auto& view : m_fastmem_ram_views) + { + if (!m_memory_arena.SetPageProtection(view.GetBasePointer(), view.GetMappingSize(), true, true, false)) + { + Log_ErrorPrintf("Failed to unprotect code pages for fastmem view @ %p", view.GetBasePointer()); + } + } +} + +bool IsCodePageAddress(PhysicalMemoryAddress address) +{ + return IsRAMAddress(address) ? m_ram_code_bits[(address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE] : false; +} + +bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size) +{ + if (!IsRAMAddress(start_address)) + return false; + + start_address = (start_address & RAM_MASK); + + const u32 end_address = start_address + size; + while (start_address < end_address) + { + const u32 code_page_index = start_address / CPU_CODE_CACHE_PAGE_SIZE; + if (m_ram_code_bits[code_page_index]) + return true; + + start_address += CPU_CODE_CACHE_PAGE_SIZE; + } + + return false; +} + + static TickCount DoInvalidAccess(MemoryAccessType type, MemoryAccessSize size, PhysicalMemoryAddress address, u32& value) { diff --git a/src/core/bus.h b/src/core/bus.h index d2f187ba6..ef1905bee 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -1,5 +1,6 @@ #pragma once #include "common/bitfield.h" +#include "common/memory_arena.h" #include "types.h" #include #include @@ -65,26 +66,69 @@ enum : u32 MEMCTRL_REG_COUNT = 9 }; -void Initialize(); +enum : TickCount +{ + RAM_READ_TICKS = 4 +}; + +enum : size_t +{ + // Our memory arena contains storage for RAM and BIOS. + MEMORY_ARENA_SIZE = RAM_SIZE + BIOS_SIZE, + + // Offsets within the memory arena. + MEMORY_ARENA_RAM_OFFSET = 0, + MEMORY_ARENA_BIOS_OFFSET = MEMORY_ARENA_RAM_OFFSET + RAM_SIZE, + + // Fastmem region size is 4GB to cover the entire 32-bit address space. + FASTMEM_REGION_SIZE = UINT64_C(0x100000000) +}; + +bool Initialize(); void Shutdown(); void Reset(); bool DoState(StateWrapper& sw); +u8* GetFastmemBase(); +bool AllocateMemory(); +void UpdateFastmemViews(bool enabled, bool isolate_cache); + void SetExpansionROM(std::vector data); void SetBIOS(const std::vector& image); extern std::bitset m_ram_code_bits; -extern u8 g_ram[RAM_SIZE]; // 2MB RAM -extern u8 g_bios[BIOS_SIZE]; // 512K BIOS ROM +extern u8* g_ram; // 2MB RAM +extern u8* g_bios; // 512K BIOS ROM + +/// Returns true if the address specified is writable (RAM). +ALWAYS_INLINE static bool IsRAMAddress(PhysicalMemoryAddress address) +{ + return address < RAM_MIRROR_END; +} + +/// Returns the code page index for a RAM address. +ALWAYS_INLINE static u32 GetRAMCodePageIndex(PhysicalMemoryAddress address) +{ + return (address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE; +} + +/// Returns true if the specified page contains code. +bool IsRAMCodePage(u32 index); /// Flags a RAM region as code, so we know when to invalidate blocks. -ALWAYS_INLINE void SetRAMCodePage(u32 index) { m_ram_code_bits[index] = true; } +void SetRAMCodePage(u32 index); /// Unflags a RAM region as code, the code cache will no longer be notified when writes occur. -ALWAYS_INLINE void ClearRAMCodePage(u32 index) { m_ram_code_bits[index] = false; } +void ClearRAMCodePage(u32 index); /// Clears all code bits for RAM regions. -ALWAYS_INLINE void ClearRAMCodePageFlags() { m_ram_code_bits.reset(); } +void ClearRAMCodePageFlags(); + +/// Returns true if the specified address is in a code page. +bool IsCodePageAddress(PhysicalMemoryAddress address); + +/// Returns true if the range specified overlaps with a code page. +bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size); /// Returns the number of cycles stolen by DMA RAM access. ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) @@ -96,4 +140,4 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } -} // namespace Bus +} // namespace Bus \ No newline at end of file diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index e4a1a55e5..041dd81aa 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -5,6 +5,7 @@ #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_disasm.h" +#include "settings.h" #include "system.h" #include "timing_event.h" Log_SetChannel(CPU::CodeCache); @@ -61,6 +62,7 @@ static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function) #endif using BlockMap = std::unordered_map; +using HostCodeMap = std::map; void LogCurrentState(); @@ -85,36 +87,49 @@ static void LinkBlock(CodeBlock* from, CodeBlock* to); /// Unlink all blocks which point to this block, and any that this block links to. static void UnlinkBlock(CodeBlock* block); -static bool s_use_recompiler = false; static BlockMap s_blocks; static std::array, CPU_CODE_CACHE_PAGE_COUNT> m_ram_block_map; -void Initialize(bool use_recompiler) +#ifdef WITH_RECOMPILER +static HostCodeMap s_host_code_map; + +static void AddBlockToHostCodeMap(CodeBlock* block); +static void RemoveBlockFromHostCodeMap(CodeBlock* block); +static bool InitializeFastmem(); +static void ShutdownFastmem(); +static Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write); +#endif + +void Initialize() { Assert(s_blocks.empty()); #ifdef WITH_RECOMPILER - s_use_recompiler = use_recompiler; -#ifdef USE_STATIC_CODE_BUFFER - if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, - RECOMPILER_GUARD_SIZE)) -#else - if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) -#endif + if (g_settings.IsUsingRecompiler()) { - Panic("Failed to initialize code space"); - } - - ResetFastMap(); - CompileDispatcher(); +#ifdef USE_STATIC_CODE_BUFFER + if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, + RECOMPILER_GUARD_SIZE)) #else - s_use_recompiler = false; + if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) +#endif + { + Panic("Failed to initialize code space"); + } + + if (g_settings.IsUsingFastmem() && !InitializeFastmem()) + Panic("Failed to initialize fastmem"); + + ResetFastMap(); + CompileDispatcher(); + } #endif } void Shutdown() { Flush(); + ShutdownFastmem(); #ifdef WITH_RECOMPILER s_code_buffer.Destroy(); #endif @@ -279,14 +294,33 @@ void ExecuteRecompiler() #endif -void SetUseRecompiler(bool enable) +void Reinitialize() { -#ifdef WITH_RECOMPILER - if (s_use_recompiler == enable) - return; - - s_use_recompiler = enable; Flush(); +#ifdef WITH_RECOMPILER + + ShutdownFastmem(); + s_code_buffer.Destroy(); + + if (g_settings.IsUsingRecompiler()) + { + +#ifdef USE_STATIC_CODE_BUFFER + if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, + RECOMPILER_GUARD_SIZE)) +#else + if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) +#endif + { + Panic("Failed to initialize code space"); + } + + if (g_settings.IsUsingFastmem() && !InitializeFastmem()) + Panic("Failed to initialize fastmem"); + + ResetFastMap(); + CompileDispatcher(); + } #endif } @@ -298,8 +332,10 @@ void Flush() for (const auto& it : s_blocks) delete it.second; + s_blocks.clear(); #ifdef WITH_RECOMPILER + s_host_code_map.clear(); s_code_buffer.Reset(); ResetFastMap(); CompileDispatcher(); @@ -358,6 +394,8 @@ CodeBlock* LookupBlock(CodeBlockKey key) } iter = s_blocks.emplace(key.bits, block).first; + AddBlockToHostCodeMap(block); + return block; } @@ -384,6 +422,8 @@ bool RevalidateBlock(CodeBlock* block) return true; recompile: + RemoveBlockFromHostCodeMap(block); + block->instructions.clear(); if (!CompileBlock(block)) { @@ -393,6 +433,7 @@ recompile: } // re-add to page map again + AddBlockToHostCodeMap(block); if (block->IsInRAM()) AddBlockToPageMap(block); @@ -439,6 +480,9 @@ bool CompileBlock(CodeBlock* block) block->uncached_fetch_ticks += GetInstructionReadTicks(pc); } + block->contains_loadstore_instructions |= cbi.is_load_instruction; + block->contains_loadstore_instructions |= cbi.is_store_instruction; + // instruction is decoded now block->instructions.push_back(cbi); pc += sizeof(cbi.instruction.bits); @@ -481,7 +525,7 @@ bool CompileBlock(CodeBlock* block) } #ifdef WITH_RECOMPILER - if (s_use_recompiler) + if (g_settings.IsUsingRecompiler()) { // Ensure we're not going to run out of space while compiling this block. if (s_code_buffer.GetFreeCodeSpace() < @@ -552,6 +596,9 @@ void FlushBlock(CodeBlock* block) RemoveBlockFromPageMap(block); UnlinkBlock(block); +#ifdef WITH_RECOMPILER + RemoveBlockFromHostCodeMap(block); +#endif s_blocks.erase(iter); delete block; @@ -613,4 +660,107 @@ void UnlinkBlock(CodeBlock* block) block->link_successors.clear(); } +#ifdef WITH_RECOMPILER + +void AddBlockToHostCodeMap(CodeBlock* block) +{ + if (!g_settings.IsUsingRecompiler()) + return; + + auto ir = s_host_code_map.emplace(block->host_code, block); + Assert(ir.second); +} + +void RemoveBlockFromHostCodeMap(CodeBlock* block) +{ + if (!g_settings.IsUsingRecompiler()) + return; + + HostCodeMap::iterator hc_iter = s_host_code_map.find(block->host_code); + Assert(hc_iter != s_host_code_map.end()); + s_host_code_map.erase(hc_iter); +} + +bool InitializeFastmem() +{ + if (!Common::PageFaultHandler::InstallHandler(&s_host_code_map, PageFaultHandler)) + { + Log_ErrorPrintf("Failed to install page fault handler"); + return false; + } + + Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc); + return true; +} + +void ShutdownFastmem() +{ + Common::PageFaultHandler::RemoveHandler(&s_host_code_map); + Bus::UpdateFastmemViews(false, false); +} + +Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write) +{ + if (static_cast(fault_address) < g_state.fastmem_base || + (static_cast(fault_address) - g_state.fastmem_base) >= Bus::FASTMEM_REGION_SIZE) + { + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + + const PhysicalMemoryAddress fastmem_address = + static_cast(static_cast(static_cast(fault_address) - g_state.fastmem_base)); + + Log_DevPrintf("Page fault handler invoked at PC=%p Address=%p %s, fastmem offset 0x%08X", exception_pc, fault_address, + is_write ? "(write)" : "(read)", fastmem_address); + + if (is_write && !g_state.cop0_regs.sr.Isc && Bus::IsRAMAddress(fastmem_address)) + { + // this is probably a code page, since we aren't going to fault due to requiring fastmem on RAM. + const u32 code_page_index = Bus::GetRAMCodePageIndex(fastmem_address); + if (Bus::IsRAMCodePage(code_page_index)) + { + InvalidateBlocksWithPageIndex(code_page_index); + return Common::PageFaultHandler::HandlerResult::ContinueExecution; + } + } + + // use upper_bound to find the next block after the pc + HostCodeMap::iterator upper_iter = + s_host_code_map.upper_bound(reinterpret_cast(exception_pc)); + if (upper_iter == s_host_code_map.begin()) + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + + // then decrement it by one to (hopefully) get the block we want + upper_iter--; + + // find the loadstore info in the code block + CodeBlock* block = upper_iter->second; + for (auto bpi_iter = block->loadstore_backpatch_info.begin(); bpi_iter != block->loadstore_backpatch_info.end(); + ++bpi_iter) + { + const Recompiler::LoadStoreBackpatchInfo& lbi = *bpi_iter; + if (lbi.host_pc == exception_pc) + { + // found it, do fixup + if (Recompiler::CodeGenerator::BackpatchLoadStore(lbi)) + { + // remove the backpatch entry since we won't be coming back to this one + block->loadstore_backpatch_info.erase(bpi_iter); + return Common::PageFaultHandler::HandlerResult::ContinueExecution; + } + else + { + Log_ErrorPrintf("Failed to backpatch %p in block 0x%08X", exception_pc, block->GetPC()); + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + } + } + + // we didn't find the pc in our list.. + Log_ErrorPrintf("Loadstore PC not found for %p in block 0x%08X", exception_pc, block->GetPC()); + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; +} + +#endif + } // namespace CPU::CodeCache diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 5d285191b..92a8b0d0b 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -2,12 +2,18 @@ #include "bus.h" #include "common/bitfield.h" #include "common/jit_code_buffer.h" +#include "common/page_fault_handler.h" #include "cpu_types.h" #include +#include #include #include #include +#ifdef WITH_RECOMPILER +#include "cpu_recompiler_types.h" +#endif + namespace CPU { enum : u32 @@ -71,6 +77,12 @@ struct CodeBlock TickCount uncached_fetch_ticks = 0; u32 icache_line_count = 0; + +#ifdef WITH_RECOMPILER + std::vector loadstore_backpatch_info; +#endif + + bool contains_loadstore_instructions = false; bool invalidated = false; const u32 GetPC() const { return key.GetPC(); } @@ -89,7 +101,7 @@ struct CodeBlock namespace CodeCache { -void Initialize(bool use_recompiler); +void Initialize(); void Shutdown(); void Execute(); @@ -102,7 +114,7 @@ void ExecuteRecompiler(); void Flush(); /// Changes whether the recompiler is enabled. -void SetUseRecompiler(bool enable); +void Reinitialize(); /// Invalidates all blocks which are in the range of the specified code page. void InvalidateBlocksWithPageIndex(u32 page_index); diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 0d88e465b..a26e8cc60 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -1,4 +1,5 @@ #include "cpu_core.h" +#include "bus.h" #include "common/align.h" #include "common/file_system.h" #include "common/log.h" @@ -1563,6 +1564,11 @@ bool InterpretInstructionPGXP() return g_state.exception_raised; } +void UpdateFastmemMapping() +{ + Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc); +} + } // namespace Recompiler::Thunks } // namespace CPU \ No newline at end of file diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h index 43c14c99a..58d396d30 100644 --- a/src/core/cpu_core.h +++ b/src/core/cpu_core.h @@ -79,6 +79,8 @@ struct State // GTE registers are stored here so we can access them on ARM with a single instruction GTE::Regs gte_regs = {}; + u8* fastmem_base = nullptr; + // data cache (used as scratchpad) std::array dcache = {}; std::array icache_tags = {}; diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index b3f1bf4cb..084e2f001 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -19,8 +19,7 @@ u32 CodeGenerator::CalculateRegisterOffset(Reg reg) return u32(offsetof(State, regs.r[0]) + (static_cast(reg) * sizeof(u32))); } -bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, - u32* out_host_code_size) +bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) { // TODO: Align code buffer. @@ -40,8 +39,10 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray()); #endif + m_current_instruction = cbi; if (!CompileInstruction(*cbi)) { + m_current_instruction = nullptr; m_block_end = nullptr; m_block_start = nullptr; m_block = nullptr; @@ -60,6 +61,7 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin DebugAssert(m_register_cache.GetUsedHostRegisters() == 0); + m_current_instruction = nullptr; m_block_end = nullptr; m_block_start = nullptr; m_block = nullptr; @@ -1895,7 +1897,22 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) value = AndValues(value, Value::FromConstantU32(write_mask)); } - EmitStoreCPUStructField(offset, value); + // changing SR[Isc] needs to update fastmem views + if (reg == Cop0Reg::SR && g_settings.cpu_fastmem) + { + LabelType skip_fastmem_update; + Value old_value = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(old_value.host_reg, RegSize_32, offset); + EmitStoreCPUStructField(offset, value); + EmitXor(old_value.host_reg, old_value.host_reg, value); + EmitBranchIfBitClear(old_value.host_reg, RegSize_32, 16, &skip_fastmem_update); + EmitFunctionCall(nullptr, &Thunks::UpdateFastmemMapping, m_register_cache.GetCPUPtr()); + EmitBindLabel(&skip_fastmem_update); + } + else + { + EmitStoreCPUStructField(offset, value); + } } } diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 699c50d99..a5138a7dd 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -23,7 +23,9 @@ public: static const char* GetHostRegName(HostReg reg, RegSize size = HostPointerSize); static void AlignCodeBuffer(JitCodeBuffer* code_buffer); - bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + static bool BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi); + + bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); CodeBlock::HostCodePointer CompileDispatcher(); @@ -73,7 +75,11 @@ public: // Automatically generates an exception handler. Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size); + void EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result); + void EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result, bool in_far_code); void EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const Value& value); + void EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value); + void EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value, bool in_far_code); // Unconditional branch to pointer. May allocate a scratch register. void EmitBranch(const void* address, bool allow_scratch = true); @@ -204,9 +210,10 @@ private: bool Compile_cop2(const CodeBlockInstruction& cbi); JitCodeBuffer* m_code_buffer; - const CodeBlock* m_block = nullptr; + CodeBlock* m_block = nullptr; const CodeBlockInstruction* m_block_start = nullptr; const CodeBlockInstruction* m_block_end = nullptr; + const CodeBlockInstruction* m_current_instruction = nullptr; RegisterCache m_register_cache; CodeEmitter m_near_emitter; CodeEmitter m_far_emitter; diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 3225be0ab..af1db51cb 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -14,6 +14,7 @@ namespace a64 = vixl::aarch64; namespace CPU::Recompiler { constexpr HostReg RCPUPTR = 19; +constexpr HostReg RMEMBASEPTR = 20; constexpr HostReg RRETURN = 0; constexpr HostReg RARG1 = 0; constexpr HostReg RARG2 = 1; @@ -86,6 +87,11 @@ static const a64::XRegister GetCPUPtrReg() return GetHostReg64(RCPUPTR); } +static const a64::XRegister GetFastmemBasePtrReg() +{ + return GetHostReg64(RMEMBASEPTR); +} + CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer) : m_code_buffer(code_buffer), m_register_cache(*this), m_near_emitter(static_cast(code_buffer->GetFreeCodePointer()), code_buffer->GetFreeCodeSpace(), @@ -188,10 +194,21 @@ void CodeGenerator::EmitBeginBlock() // Store the CPU struct pointer. TODO: make this better. const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); + + // If there's loadstore instructions, preload the fastmem base. + if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); + } } void CodeGenerator::EmitEndBlock() { + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); + m_register_cache.FreeHostReg(RCPUPTR); m_register_cache.PopCalleeSavedRegisters(true); @@ -1308,12 +1325,105 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + Value result = m_register_cache.AllocateScratch(RegSize_64); + if (g_settings.IsUsingFastmem()) + { + EmitLoadGuestMemoryFastmem(cbi, address, size, result); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); + } + + // Downcast to ignore upper 56/48/32 bits. This should be a noop. + switch (size) + { + case RegSize_8: + ConvertValueSizeInPlace(&result, RegSize_8, false); + break; + + case RegSize_16: + ConvertValueSizeInPlace(&result, RegSize_16, false); + break; + + case RegSize_32: + ConvertValueSizeInPlace(&result, RegSize_32, false); + break; + + default: + UnreachableCode(); + break; + } + + return result; +} + +void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = result.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + a64::MemOperand actual_address; + if (address.IsConstant()) + { + m_emit->Mov(GetHostReg32(result.host_reg), address.constant_value); + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(result.host_reg)); + bpi.host_pc = GetCurrentNearCodePointer(); + } + else + { + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address)); + } + + // TODO: movsx/zx inline here + switch (size) + { + case RegSize_8: + m_emit->Ldrb(GetHostReg32(result.host_reg), actual_address); + break; + + case RegSize_16: + m_emit->Ldrh(GetHostReg32(result.host_reg), actual_address); + break; + + case RegSize_32: + m_emit->Ldr(GetHostReg32(result.host_reg), actual_address); + break; + + default: + UnreachableCode(); + break; + } + + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + + // return to the block code + EmitBranch(GetCurrentNearCodePointer(), false); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - // We need to use the full 64 bits here since we test the sign bit result. - Value result = m_register_cache.AllocateScratch(RegSize_64); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - // NOTE: This can leave junk in the upper bits switch (size) { @@ -1342,7 +1452,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const m_emit->Bind(&load_okay); // load exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (-result << 2) | BD | cop_n m_emit->neg(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg)); @@ -1353,37 +1464,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - ConvertValueSizeInPlace(&result, RegSize_32, false); - break; - - default: - UnreachableCode(); - break; - } - - return result; } else { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (size) { case RegSize_8: @@ -1402,27 +1490,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const UnreachableCode(); break; } - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - break; - - default: - UnreachableCode(); - break; - } - - return result; } } @@ -1443,11 +1510,87 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + if (g_settings.IsUsingFastmem()) + { + // we need the value in a host register to store it + Value value_in_hr = GetValueInHostRegister(value); + EmitStoreGuestMemoryFastmem(cbi, address, value_in_hr); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitStoreGuestMemorySlowmem(cbi, address, value, false); + } +} + +void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = value.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + a64::MemOperand actual_address; + if (address.IsConstant()) + { + m_emit->Mov(GetHostReg32(RSCRATCH), address.constant_value); + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RSCRATCH)); + bpi.host_pc = GetCurrentNearCodePointer(); + } + else + { + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address)); + } + + switch (value.size) + { + case RegSize_8: + m_emit->Strb(GetHostReg8(value), actual_address); + break; + + case RegSize_16: + m_emit->Strh(GetHostReg16(value), actual_address); + break; + + case RegSize_32: + m_emit->Str(GetHostReg32(value), actual_address); + break; + + default: + UnreachableCode(); + break; + } + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + + EmitStoreGuestMemorySlowmem(cbi, address, value, true); + + // return to the block code + EmitBranch(GetCurrentNearCodePointer(), false); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value, bool in_far_code) +{ + AddPendingCycles(true); + if (g_settings.cpu_recompiler_memory_exceptions) { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); + Assert(!in_far_code); + Value result = m_register_cache.AllocateScratch(RegSize_32); switch (value.size) { case RegSize_8: @@ -1475,7 +1618,8 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const m_emit->Bind(&store_okay); // store exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (result << 2) | BD | cop_n m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); @@ -1484,15 +1628,14 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); - EmitExceptionExit(); + if (!in_far_code) + EmitExceptionExit(); SwitchToNearCode(); m_register_cache.PopState(); } else { - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (value.size) { case RegSize_8: @@ -1514,6 +1657,30 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } } +bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +{ + Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc); + + // check jump distance + const s64 jump_distance = + static_cast(reinterpret_cast(lbi.host_slowmem_pc) - reinterpret_cast(lbi.host_pc)); + Assert(Common::IsAligned(jump_distance, 4)); + Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2)); + + // turn it into a jump to the slowmem handler + vixl::aarch64::MacroAssembler emit(static_cast(lbi.host_pc), lbi.host_code_size, + a64::PositionDependentCode); + emit.b(jump_distance >> 2); + + const s32 nops = (static_cast(lbi.host_code_size) - static_cast(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); + return true; +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { EmitLoadGlobalAddress(RSCRATCH, ptr); diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 5770b4f4f..cb42a4e30 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,4 +1,5 @@ #include "common/align.h" +#include "common/assert.h" #include "common/log.h" #include "cpu_core.h" #include "cpu_core_private.h" @@ -12,6 +13,7 @@ namespace CPU::Recompiler { #if defined(ABI_WIN64) constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; +constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; constexpr HostReg RRETURN = Xbyak::Operand::RAX; constexpr HostReg RARG1 = Xbyak::Operand::RCX; constexpr HostReg RARG2 = Xbyak::Operand::RDX; @@ -21,6 +23,7 @@ constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32; constexpr u64 FUNCTION_CALL_STACK_ALIGNMENT = 16; #elif defined(ABI_SYSV) constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; +constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; constexpr HostReg RRETURN = Xbyak::Operand::RAX; constexpr HostReg RARG1 = Xbyak::Operand::RDI; constexpr HostReg RARG2 = Xbyak::Operand::RSI; @@ -79,6 +82,11 @@ static const Xbyak::Reg64 GetCPUPtrReg() return GetHostReg64(RCPUPTR); } +static const Xbyak::Reg64 GetFastmemBasePtrReg() +{ + return GetHostReg64(RMEMBASEPTR); +} + CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer) : m_code_buffer(code_buffer), m_register_cache(*this), m_near_emitter(code_buffer->GetFreeCodeSpace(), code_buffer->GetFreeCodePointer()), @@ -140,7 +148,6 @@ void CodeGenerator::InitHostRegs() m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI, Xbyak::Operand::RSP, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}); - m_register_cache.SetCPUPtrHostReg(RCPUPTR); #elif defined(ABI_SYSV) m_register_cache.SetHostRegAllocationOrder( {Xbyak::Operand::RBX, /*Xbyak::Operand::RSP, */ Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, @@ -154,8 +161,9 @@ void CodeGenerator::InitHostRegs() m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RSP, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}); - m_register_cache.SetCPUPtrHostReg(RCPUPTR); #endif + + m_register_cache.SetCPUPtrHostReg(RCPUPTR); } void CodeGenerator::SwitchToFarCode() @@ -196,11 +204,22 @@ void CodeGenerator::EmitBeginBlock() const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + + // If there's loadstore instructions, preload the fastmem base. + if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]); + } } void CodeGenerator::EmitEndBlock() { m_register_cache.FreeHostReg(RCPUPTR); + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); + m_register_cache.PopCalleeSavedRegisters(true); m_emit->ret(); @@ -1762,12 +1781,139 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + Value result = m_register_cache.AllocateScratch(RegSize_64); + if (g_settings.IsUsingFastmem()) + { + EmitLoadGuestMemoryFastmem(cbi, address, size, result); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); + } + + // Downcast to ignore upper 56/48/32 bits. This should be a noop. + switch (size) + { + case RegSize_8: + ConvertValueSizeInPlace(&result, RegSize_8, false); + break; + + case RegSize_16: + ConvertValueSizeInPlace(&result, RegSize_16, false); + break; + + case RegSize_32: + ConvertValueSizeInPlace(&result, RegSize_32, false); + break; + + default: + UnreachableCode(); + break; + } + + return result; +} + +void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = result.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + // can't store displacements > 0x80000000 in-line + const Value* actual_address = &address; + if (address.IsConstant() && address.constant_value >= 0x80000000) + { + actual_address = &result; + m_emit->mov(GetHostReg32(result.host_reg), address.constant_value); + bpi.host_pc = GetCurrentNearCodePointer(); + } + + // TODO: movsx/zx inline here + switch (size) + { + case RegSize_8: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg8(result.host_reg), + m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg8(result.host_reg), + m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + + case RegSize_16: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg16(result.host_reg), + m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg16(result.host_reg), + m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + + case RegSize_32: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg32(result.host_reg), + m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg32(result.host_reg), + m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + } + + // TODO: BIOS reads... + EmitAddCPUStructField(offsetof(CPU::State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); + + // insert nops, we need at least 5 bytes for a relative jump + const u32 fastmem_size = + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0); + for (u32 i = 0; i < nops; i++) + m_emit->nop(); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + + // return to the block code + m_emit->jmp(GetCurrentNearCodePointer()); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - // We need to use the full 64 bits here since we test the sign bit result. - Value result = m_register_cache.AllocateScratch(RegSize_64); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - // NOTE: This can leave junk in the upper bits switch (size) { @@ -1794,7 +1940,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const m_register_cache.PushState(); // load exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (-result << 2) | BD | cop_n m_emit->neg(GetHostReg32(result.host_reg)); @@ -1805,37 +1952,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - ConvertValueSizeInPlace(&result, RegSize_32, false); - break; - - default: - UnreachableCode(); - break; - } - - return result; } else { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (size) { case RegSize_8: @@ -1854,27 +1978,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const UnreachableCode(); break; } - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - break; - - default: - UnreachableCode(); - break; - } - - return result; } } @@ -1895,11 +1998,163 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + if (g_settings.IsUsingFastmem()) + { + EmitStoreGuestMemoryFastmem(cbi, address, value); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitStoreGuestMemorySlowmem(cbi, address, value, false); + } +} + +void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = value.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + // can't store displacements > 0x80000000 in-line + const Value* actual_address = &address; + Value temp_address; + if (address.IsConstant() && address.constant_value >= 0x80000000) + { + temp_address.SetHostReg(&m_register_cache, RRETURN, RegSize_32); + actual_address = &temp_address; + m_emit->mov(GetHostReg32(temp_address), address.constant_value); + bpi.host_pc = GetCurrentNearCodePointer(); + } + + switch (value.size) + { + case RegSize_8: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg8(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + GetHostReg8(value.host_reg)); + } + } + } + break; + + case RegSize_16: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg16(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + GetHostReg16(value.host_reg)); + } + } + } + break; + + case RegSize_32: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg32(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + GetHostReg32(value.host_reg)); + } + } + } + break; + } + + // insert nops, we need at least 5 bytes for a relative jump + const u32 fastmem_size = + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0); + for (u32 i = 0; i < nops; i++) + m_emit->nop(); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + + EmitStoreGuestMemorySlowmem(cbi, address, value, true); + + // return to the block code + m_emit->jmp(GetCurrentNearCodePointer()); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); + Assert(!in_far_code); + Value result = m_register_cache.AllocateScratch(RegSize_32); switch (value.size) { case RegSize_8: @@ -1925,24 +2180,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const m_emit->jnz(GetCurrentFarCodePointer()); // store exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (result << 2) | BD | cop_n - m_emit->shl(GetHostReg32(result.host_reg), 2); - m_emit->or_(GetHostReg32(result.host_reg), + m_emit->shl(GetHostReg32(result), 2); + m_emit->or_(GetHostReg32(result), Cop0Registers::CAUSE::MakeValueForException(static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); } else { - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (value.size) { case RegSize_8: @@ -1964,6 +2219,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } } +bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +{ + Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc); + + // turn it into a jump to the slowmem handler + Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc); + cg.jmp(lbi.host_slowmem_pc); + + const s32 nops = static_cast(lbi.host_code_size) - + static_cast(static_cast(cg.getCurr() - static_cast(lbi.host_pc))); + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + cg.nop(); + + JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); + return true; +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { const s64 displacement = diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h index f698a859d..b9f5ced77 100644 --- a/src/core/cpu_recompiler_thunks.h +++ b/src/core/cpu_recompiler_thunks.h @@ -32,6 +32,7 @@ void UncheckedWriteMemoryByte(u32 address, u8 value); void UncheckedWriteMemoryHalfWord(u32 address, u16 value); void UncheckedWriteMemoryWord(u32 address, u32 value); +void UpdateFastmemMapping(); } // namespace Recompiler::Thunks diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index 9bb224223..3a8f1bc3c 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -127,6 +127,16 @@ constexpr bool SHIFTS_ARE_IMPLICITLY_MASKED = false; #endif +struct LoadStoreBackpatchInfo +{ + void* host_pc; // pointer to instruction which will fault + void* host_slowmem_pc; // pointer to slowmem callback code + u32 host_code_size; // size of the fastmem load as well as the add for cycles + HostReg address_host_reg; // register containing the guest address to load/store + HostReg value_host_reg; // register containing the source/destination + PhysicalMemoryAddress guest_pc; +}; + } // namespace Recompiler } // namespace CPU diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp index 600b94bd5..7dc243181 100644 --- a/src/core/host_interface.cpp +++ b/src/core/host_interface.cpp @@ -366,6 +366,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si) si.SetStringValue("CPU", "ExecutionMode", Settings::GetCPUExecutionModeName(Settings::DEFAULT_CPU_EXECUTION_MODE)); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", false); si.SetBoolValue("CPU", "ICache", false); + si.SetBoolValue("CPU", "Fastmem", true); si.SetStringValue("GPU", "Renderer", Settings::GetRendererName(Settings::DEFAULT_GPU_RENDERER)); si.SetIntValue("GPU", "ResolutionScale", 1); @@ -512,12 +513,13 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings) if (g_settings.emulation_speed != old_settings.emulation_speed) System::UpdateThrottlePeriod(); - if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode) + if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode || + g_settings.cpu_fastmem != old_settings.cpu_fastmem) { - AddFormattedOSDMessage(5.0f, "Switching to %s CPU execution mode.", - Settings::GetCPUExecutionModeName(g_settings.cpu_execution_mode)); - CPU::CodeCache::SetUseRecompiler(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); - CPU::CodeCache::Flush(); + AddFormattedOSDMessage(5.0f, "Switching to %s CPU execution mode%s.", + Settings::GetCPUExecutionModeName(g_settings.cpu_execution_mode), + g_settings.cpu_fastmem ? " (fastmem)" : ""); + CPU::CodeCache::Reinitialize(); CPU::ClearICache(); } diff --git a/src/core/settings.cpp b/src/core/settings.cpp index d62439e34..135e8a522 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -96,6 +96,7 @@ void Settings::Load(SettingsInterface& si) .value_or(DEFAULT_CPU_EXECUTION_MODE); cpu_recompiler_memory_exceptions = si.GetBoolValue("CPU", "RecompilerMemoryExceptions", false); cpu_recompiler_icache = si.GetBoolValue("CPU", "RecompilerICache", false); + cpu_fastmem = si.GetBoolValue("CPU", "Fastmem", true); gpu_renderer = ParseRendererName(si.GetStringValue("GPU", "Renderer", GetRendererName(DEFAULT_GPU_RENDERER)).c_str()) .value_or(DEFAULT_GPU_RENDERER); @@ -217,6 +218,7 @@ void Settings::Save(SettingsInterface& si) const si.SetStringValue("CPU", "ExecutionMode", GetCPUExecutionModeName(cpu_execution_mode)); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", cpu_recompiler_memory_exceptions); si.SetBoolValue("CPU", "RecompilerICache", cpu_recompiler_icache); + si.SetBoolValue("CPU", "Fastmem", cpu_fastmem); si.SetStringValue("GPU", "Renderer", GetRendererName(gpu_renderer)); si.SetStringValue("GPU", "Adapter", gpu_adapter.c_str()); diff --git a/src/core/settings.h b/src/core/settings.h index 08373fd1c..b1dfa5978 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -72,6 +72,7 @@ struct Settings CPUExecutionMode cpu_execution_mode = CPUExecutionMode::Interpreter; bool cpu_recompiler_memory_exceptions = false; bool cpu_recompiler_icache = false; + bool cpu_fastmem = true; float emulation_speed = 1.0f; bool speed_limiter_enabled = true; @@ -172,6 +173,11 @@ struct Settings return gpu_pgxp_enable ? (gpu_pgxp_cpu ? PGXPMode::CPU : PGXPMode::Memory) : PGXPMode::Disabled; } + ALWAYS_INLINE bool IsUsingFastmem() const + { + return (cpu_fastmem && cpu_execution_mode == CPUExecutionMode::Recompiler && !cpu_recompiler_memory_exceptions); + } + bool HasAnyPerGameMemoryCards() const; enum : u32 diff --git a/src/core/system.cpp b/src/core/system.cpp index b861fada4..fd2965da6 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -708,14 +708,16 @@ bool Initialize(bool force_software_renderer) TimingEvents::Initialize(); CPU::Initialize(); - CPU::CodeCache::Initialize(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); - Bus::Initialize(); + + if (!Bus::Initialize()) + return false; + + CPU::CodeCache::Initialize(); if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer)) return false; g_dma.Initialize(); - g_interrupt_controller.Initialize(); g_cdrom.Initialize(); diff --git a/src/core/types.h b/src/core/types.h index c1a281064..8f896dad2 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -129,6 +129,6 @@ enum : u32 enum : u32 { - CPU_CODE_CACHE_PAGE_SIZE = 1024, + CPU_CODE_CACHE_PAGE_SIZE = 4096, CPU_CODE_CACHE_PAGE_COUNT = 0x200000 / CPU_CODE_CACHE_PAGE_SIZE };