From a0bafd76368dc17032947b7fabdec5532f383a84 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Wed, 2 Sep 2020 23:30:56 +1000 Subject: [PATCH 1/8] CPU/Recompiler: Optimize constant reads (and some writes) --- src/core/bus.cpp | 69 +++++++++++++++++- src/core/cpu_core_private.h | 3 + .../cpu_recompiler_code_generator_aarch64.cpp | 72 ++++++++++++++++++- .../cpu_recompiler_code_generator_x64.cpp | 30 ++++++++ 4 files changed, 169 insertions(+), 5 deletions(-) diff --git a/src/core/bus.cpp b/src/core/bus.cpp index 7b8a70c0a..f238e6b17 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -22,6 +22,11 @@ Log_SetChannel(Bus); namespace Bus { +enum : TickCount +{ + RAM_READ_TICKS = 4 +}; + union MEMDELAY { u32 bits; @@ -288,7 +293,7 @@ ALWAYS_INLINE static TickCount DoRAMAccess(u32 offset, u32& value) } } - return (type == MemoryAccessType::Read) ? 4 : 0; + return (type == MemoryAccessType::Read) ? RAM_READ_TICKS : 0; } template @@ -753,7 +758,7 @@ ALWAYS_INLINE_RELEASE void DoInstructionRead(PhysicalMemoryAddress address, void { std::memcpy(data, &g_ram[address & RAM_MASK], sizeof(u32) * word_count); if constexpr (add_ticks) - g_state.pending_ticks += (icache_read ? 1 : 4) * word_count; + g_state.pending_ticks += (icache_read ? 
1 : RAM_READ_TICKS) * word_count; } else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_SIZE)) { @@ -776,7 +781,7 @@ TickCount GetInstructionReadTicks(VirtualMemoryAddress address) if (address < RAM_MIRROR_END) { - return 4; + return RAM_READ_TICKS; } else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_SIZE)) { @@ -1307,6 +1312,64 @@ bool SafeWriteMemoryWord(VirtualMemoryAddress addr, u32 value) return DoMemoryAccess(addr, value) >= 0; } +void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size, TickCount* read_ticks) +{ + using namespace Bus; + + const u32 seg = (address >> 29); + if (seg != 0 && seg != 4 && seg != 5) + return nullptr; + + const PhysicalMemoryAddress paddr = address & PHYSICAL_MEMORY_ADDRESS_MASK; + if (paddr < RAM_MIRROR_END) + { + if (read_ticks) + *read_ticks = RAM_READ_TICKS; + + return &g_ram[paddr & RAM_MASK]; + } + + if ((paddr & DCACHE_LOCATION_MASK) == DCACHE_LOCATION) + { + if (read_ticks) + *read_ticks = 0; + + return &g_state.dcache[paddr & DCACHE_OFFSET_MASK]; + } + + if (paddr >= BIOS_BASE && paddr < (BIOS_BASE + BIOS_SIZE)) + { + if (read_ticks) + *read_ticks = m_bios_access_time[static_cast(size)]; + + return &g_bios[paddr & BIOS_MASK]; + } + + return nullptr; +} + +void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size) +{ + using namespace Bus; + + const u32 seg = (address >> 29); + if (seg != 0 && seg != 4 && seg != 5) + return nullptr; + + const PhysicalMemoryAddress paddr = address & PHYSICAL_MEMORY_ADDRESS_MASK; + +#if 0 + // Not enabled until we can protect code regions. 
+ if (paddr < RAM_MIRROR_END) + return &g_ram[paddr & RAM_MASK]; +#endif + + if ((paddr & DCACHE_LOCATION_MASK) == DCACHE_LOCATION) + return &g_state.dcache[paddr & DCACHE_OFFSET_MASK]; + + return nullptr; +} + namespace Recompiler::Thunks { u64 ReadMemoryByte(u32 address) diff --git a/src/core/cpu_core_private.h b/src/core/cpu_core_private.h index 9f74fd7f0..05ee62f5f 100644 --- a/src/core/cpu_core_private.h +++ b/src/core/cpu_core_private.h @@ -1,5 +1,6 @@ #pragma once #include "cpu_core.h" +#include "bus.h" namespace CPU { @@ -72,5 +73,7 @@ bool ReadMemoryWord(VirtualMemoryAddress addr, u32* value); bool WriteMemoryByte(VirtualMemoryAddress addr, u8 value); bool WriteMemoryHalfWord(VirtualMemoryAddress addr, u16 value); bool WriteMemoryWord(VirtualMemoryAddress addr, u32 value); +void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size, TickCount* read_ticks); +void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size); } // namespace CPU \ No newline at end of file diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 4a62184ab..7d772c099 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1283,6 +1283,23 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value) Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size) { + if (address.IsConstant()) + { + TickCount read_ticks; + void* ptr = GetDirectReadMemoryPointer( + static_cast(address.constant_value), + (size == RegSize_8) ? MemoryAccessSize::Byte : + ((size == RegSize_16) ? 
MemoryAccessSize::HalfWord : MemoryAccessSize::Word), + &read_ticks); + if (ptr) + { + Value result = m_register_cache.AllocateScratch(size); + EmitLoadGlobal(result.GetHostRegister(), size, ptr); + m_delayed_cycles_add += read_ticks; + return result; + } + } + AddPendingCycles(true); if (g_settings.cpu_recompiler_memory_exceptions) @@ -1405,6 +1422,19 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const Value& value) { + if (address.IsConstant()) + { + void* ptr = GetDirectWriteMemoryPointer( + static_cast(address.constant_value), + (value.size == RegSize_8) ? MemoryAccessSize::Byte : + ((value.size == RegSize_16) ? MemoryAccessSize::HalfWord : MemoryAccessSize::Word)); + if (ptr) + { + EmitStoreGlobal(ptr, value); + return; + } + } + AddPendingCycles(true); if (g_settings.cpu_recompiler_memory_exceptions) @@ -1480,12 +1510,50 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { - Panic("Not implemented"); + m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + switch (size) + { + case RegSize_8: + m_emit->Ldrb(GetHostReg8(host_reg), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + case RegSize_16: + m_emit->Ldrh(GetHostReg16(host_reg), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + case RegSize_32: + m_emit->Ldr(GetHostReg32(host_reg), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + default: + UnreachableCode(); + break; + } } void CodeGenerator::EmitStoreGlobal(void* ptr, const Value& value) { - Panic("Not implemented"); + Value value_in_hr = GetValueInHostRegister(value); + + m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + switch (value.size) + { + case RegSize_8: + m_emit->Strb(GetHostReg8(value_in_hr), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + case RegSize_16: 
+ m_emit->Strh(GetHostReg16(value_in_hr), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + case RegSize_32: + m_emit->Str(GetHostReg32(value_in_hr), a64::MemOperand(GetHostReg64(RSCRATCH))); + break; + + default: + UnreachableCode(); + break; + } } void CodeGenerator::EmitFlushInterpreterLoadDelay() diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index fd2f34035..142f86fad 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1738,6 +1738,23 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value) Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size) { + if (address.IsConstant()) + { + TickCount read_ticks; + void* ptr = GetDirectReadMemoryPointer( + static_cast(address.constant_value), + (size == RegSize_8) ? MemoryAccessSize::Byte : + ((size == RegSize_16) ? MemoryAccessSize::HalfWord : MemoryAccessSize::Word), + &read_ticks); + if (ptr) + { + Value result = m_register_cache.AllocateScratch(size); + EmitLoadGlobal(result.GetHostRegister(), size, ptr); + m_delayed_cycles_add += read_ticks; + return result; + } + } + AddPendingCycles(true); if (g_settings.cpu_recompiler_memory_exceptions) @@ -1858,6 +1875,19 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const Value& value) { + if (address.IsConstant()) + { + void* ptr = GetDirectWriteMemoryPointer( + static_cast(address.constant_value), + (value.size == RegSize_8) ? MemoryAccessSize::Byte : + ((value.size == RegSize_16) ? 
MemoryAccessSize::HalfWord : MemoryAccessSize::Word)); + if (ptr) + { + EmitStoreGlobal(ptr, value); + return; + } + } + AddPendingCycles(true); if (g_settings.cpu_recompiler_memory_exceptions) From 5099d0e62fe753b91cc60e8fcbc8f3daa0a7591b Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Thu, 3 Sep 2020 23:49:11 +1000 Subject: [PATCH 2/8] CPU: Move interrupt check out of inner-most exec loop --- src/core/cpu_code_cache.cpp | 28 +++++++++---------- src/core/cpu_core.cpp | 31 ++++++++++++++++++++-- src/core/cpu_core_private.h | 27 +++++-------------- src/core/cpu_recompiler_code_generator.cpp | 25 ++++++++--------- 4 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 727f449d8..f6bf291f2 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -132,18 +132,17 @@ static void ExecuteImpl() g_state.frame_done = false; while (!g_state.frame_done) { + if (HasPendingInterrupt()) + { + SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); + DispatchInterrupt(); + } + TimingEvents::UpdateCPUDowncount(); next_block_key = GetNextBlockKey(); while (g_state.pending_ticks < g_state.downcount) { - if (HasPendingInterrupt()) - { - SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); - DispatchInterrupt(); - next_block_key = GetNextBlockKey(); - } - CodeBlock* block = LookupBlock(next_block_key); if (!block) { @@ -153,6 +152,7 @@ static void ExecuteImpl() } reexecute_block: + Assert(!(HasPendingInterrupt())); #if 0 const u32 tick = TimingEvents::GetGlobalTickCounter() + CPU::GetPendingTicks(); @@ -171,7 +171,7 @@ static void ExecuteImpl() if (g_state.pending_ticks >= g_state.downcount) break; - else if (HasPendingInterrupt() || !USE_BLOCK_LINKING) + else if (!USE_BLOCK_LINKING) continue; next_block_key = GetNextBlockKey(); @@ -243,16 +243,16 @@ void ExecuteRecompiler() g_state.frame_done = false; while (!g_state.frame_done) { + if 
(HasPendingInterrupt()) + { + SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); + DispatchInterrupt(); + } + TimingEvents::UpdateCPUDowncount(); while (g_state.pending_ticks < g_state.downcount) { - if (HasPendingInterrupt()) - { - SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); - DispatchInterrupt(); - } - const u32 pc = g_state.regs.pc; g_state.current_instruction_pc = pc; const u32 fast_map_index = GetFastMapIndex(pc); diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 30d7e1e3f..463d64b5a 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -229,7 +229,16 @@ void RaiseException(u32 CAUSE_bits, u32 EPC) void SetExternalInterrupt(u8 bit) { g_state.cop0_regs.cause.Ip |= static_cast(1u << bit); - g_state.interrupt_delay = 1; + + if (g_settings.cpu_execution_mode == CPUExecutionMode::Interpreter) + { + g_state.interrupt_delay = 1; + } + else + { + g_state.interrupt_delay = 0; + CheckForPendingInterrupt(); + } } void ClearExternalInterrupt(u8 bit) @@ -395,6 +404,7 @@ ALWAYS_INLINE_RELEASE static void WriteCop0Reg(Cop0Reg reg, u32 value) g_state.cop0_regs.sr.bits = (g_state.cop0_regs.sr.bits & ~Cop0Registers::SR::WRITE_MASK) | (value & Cop0Registers::SR::WRITE_MASK); Log_DebugPrintf("COP0 SR <- %08X (now %08X)", value, g_state.cop0_regs.sr.bits); + CheckForPendingInterrupt(); } break; @@ -403,6 +413,7 @@ ALWAYS_INLINE_RELEASE static void WriteCop0Reg(Cop0Reg reg, u32 value) g_state.cop0_regs.cause.bits = (g_state.cop0_regs.cause.bits & ~Cop0Registers::CAUSE::WRITE_MASK) | (value & Cop0Registers::CAUSE::WRITE_MASK); Log_DebugPrintf("COP0 CAUSE <- %08X (now %08X)", value, g_state.cop0_regs.cause.bits); + CheckForPendingInterrupt(); } break; @@ -1216,6 +1227,7 @@ restart_instruction: // restore mode g_state.cop0_regs.sr.mode_bits = (g_state.cop0_regs.sr.mode_bits & UINT32_C(0b110000)) | (g_state.cop0_regs.sr.mode_bits >> 2); + CheckForPendingInterrupt(); } break; @@ -1365,6 +1377,20 @@ 
restart_instruction: } } +void DispatchInterrupt() +{ + // If the instruction we're about to execute is a GTE instruction, delay dispatching the interrupt until the next + // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. + if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction()) + GTE::ExecuteInstruction(g_state.next_instruction.bits); + + // Interrupt raising occurs before the start of the instruction. + RaiseException( + Cop0Registers::CAUSE::MakeValueForException(Exception::INT, g_state.next_instruction_is_branch_delay_slot, + g_state.branch_was_taken, g_state.next_instruction.cop.cop_n), + g_state.regs.pc); +} + template static void ExecuteImpl() { @@ -1375,9 +1401,10 @@ static void ExecuteImpl() while (g_state.pending_ticks < g_state.downcount) { - if (HasPendingInterrupt()) + if (HasPendingInterrupt() && !g_state.interrupt_delay) DispatchInterrupt(); + g_state.interrupt_delay = false; g_state.pending_ticks++; // now executing the instruction we previously fetched diff --git a/src/core/cpu_core_private.h b/src/core/cpu_core_private.h index 05ee62f5f..caff0de60 100644 --- a/src/core/cpu_core_private.h +++ b/src/core/cpu_core_private.h @@ -8,33 +8,20 @@ namespace CPU { void RaiseException(Exception excode); void RaiseException(u32 CAUSE_bits, u32 EPC); -ALWAYS_INLINE static bool HasPendingInterrupt() +ALWAYS_INLINE bool HasPendingInterrupt() { - // const bool do_interrupt = g_state.m_cop0_regs.sr.IEc && ((g_state.m_cop0_regs.cause.Ip & g_state.m_cop0_regs.sr.Im) - // != 0); - const bool do_interrupt = g_state.cop0_regs.sr.IEc && + return g_state.cop0_regs.sr.IEc && (((g_state.cop0_regs.cause.bits & g_state.cop0_regs.sr.bits) & (UINT32_C(0xFF) << 8)) != 0); - - const bool interrupt_delay = g_state.interrupt_delay; - g_state.interrupt_delay = false; - - return do_interrupt && !interrupt_delay; } -ALWAYS_INLINE static void DispatchInterrupt() 
+ALWAYS_INLINE void CheckForPendingInterrupt() { - // If the instruction we're about to execute is a GTE instruction, delay dispatching the interrupt until the next - // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. - if (g_state.next_instruction.IsCop2Instruction()) - return; - - // Interrupt raising occurs before the start of the instruction. - RaiseException( - Cop0Registers::CAUSE::MakeValueForException(Exception::INT, g_state.next_instruction_is_branch_delay_slot, - g_state.branch_was_taken, g_state.next_instruction.cop.cop_n), - g_state.regs.pc); + if (HasPendingInterrupt()) + g_state.downcount = 0; } +void DispatchInterrupt(); + // icache stuff ALWAYS_INLINE bool IsCachedAddress(VirtualMemoryAddress address) { diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index f350844a3..912957bc7 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -1913,21 +1913,8 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) EmitBranchIfBitClear(sr_value.host_reg, sr_value.size, 0, &no_interrupt); EmitAnd(sr_value.host_reg, sr_value.host_reg, cause_value); EmitTest(sr_value.host_reg, Value::FromConstantU32(0xFF00)); - sr_value.ReleaseAndClear(); - cause_value.ReleaseAndClear(); EmitConditionalBranch(Condition::Zero, false, &no_interrupt); - - EmitBranch(GetCurrentFarCodePointer()); - SwitchToFarCode(); - - // we want to flush pc here - m_register_cache.PushState(); - m_register_cache.FlushAllGuestRegisters(false, true); - WriteNewPC(CalculatePC(), false); - EmitExceptionExit(); - m_register_cache.PopState(); - - SwitchToNearCode(); + EmitStoreCPUStructField(offsetof(State, downcount), Value::FromConstantU32(0)); EmitBindLabel(&no_interrupt); } @@ -1962,6 +1949,16 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) EmitStoreCPUStructField(offsetof(State, cop0_regs.sr.bits), 
sr); + Value cause_value = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(cause_value.host_reg, cause_value.size, offsetof(State, cop0_regs.cause.bits)); + + LabelType no_interrupt; + EmitAnd(sr.host_reg, sr.host_reg, cause_value); + EmitTest(sr.host_reg, Value::FromConstantU32(0xFF00)); + EmitConditionalBranch(Condition::Zero, false, &no_interrupt); + EmitStoreCPUStructField(offsetof(State, downcount), Value::FromConstantU32(0)); + EmitBindLabel(&no_interrupt); + InstructionEpilogue(cbi); return true; } From b90dbf34c458b22da8387046851f0d1a903f694a Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 00:17:21 +1000 Subject: [PATCH 3/8] WIP: ASM dispatcher for recompiler --- src/core/bus.h | 13 -- src/core/cpu_code_cache.cpp | 26 ++- src/core/cpu_code_cache.h | 21 +++ src/core/cpu_core.cpp | 1 + src/core/cpu_recompiler_code_generator.h | 3 + .../cpu_recompiler_code_generator_aarch64.cpp | 162 ++++++++++++++++-- .../cpu_recompiler_code_generator_x64.cpp | 121 ++++++++++++- src/core/cpu_recompiler_register_cache.cpp | 27 +++ src/core/cpu_recompiler_register_cache.h | 6 + src/core/dma.cpp | 5 +- src/core/timing_event.cpp | 5 + src/core/timing_event.h | 2 + 12 files changed, 354 insertions(+), 38 deletions(-) diff --git a/src/core/bus.h b/src/core/bus.h index 10c44f90e..d2f187ba6 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -1,6 +1,5 @@ #pragma once #include "common/bitfield.h" -#include "cpu_code_cache.h" #include "types.h" #include #include @@ -97,16 +96,4 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } -/// Invalidates any code pages which overlap the specified range. 
-ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) -{ - const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; - const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; - for (u32 page = start_page; page <= end_page; page++) - { - if (m_ram_code_bits[page]) - CPU::CodeCache::InvalidateBlocksWithPageIndex(page); - } -} - } // namespace Bus diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index f6bf291f2..e4a1a55e5 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -35,14 +35,8 @@ alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8 static JitCodeBuffer s_code_buffer; -enum : u32 -{ - FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, - FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, - FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, -}; - std::array s_fast_map; +CodeBlock::HostCodePointer s_asm_dispatcher; ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) { @@ -51,6 +45,7 @@ ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) ((pc & Bus::RAM_MASK) >> 2); } +static void CompileDispatcher(); static void FastCompileBlockFunction(); static void ResetFastMap() @@ -111,6 +106,7 @@ void Initialize(bool use_recompiler) } ResetFastMap(); + CompileDispatcher(); #else s_use_recompiler = false; #endif @@ -238,9 +234,21 @@ void Execute() #ifdef WITH_RECOMPILER +void CompileDispatcher() +{ + Recompiler::CodeGenerator cg(&s_code_buffer); + s_asm_dispatcher = cg.CompileDispatcher(); +} + +CodeBlock::HostCodePointer* GetFastMapPointer() +{ + return s_fast_map.data(); +} + void ExecuteRecompiler() { g_state.frame_done = false; +#if 0 while (!g_state.frame_done) { if (HasPendingInterrupt()) @@ -261,6 +269,9 @@ void ExecuteRecompiler() TimingEvents::RunEvents(); } +#else + s_asm_dispatcher(); +#endif // in case we switch to interpreter... 
g_state.regs.npc = g_state.regs.pc; @@ -291,6 +302,7 @@ void Flush() #ifdef WITH_RECOMPILER s_code_buffer.Reset(); ResetFastMap(); + CompileDispatcher(); #endif } diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 068e6706e..5d285191b 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -1,4 +1,5 @@ #pragma once +#include "bus.h" #include "common/bitfield.h" #include "common/jit_code_buffer.h" #include "cpu_types.h" @@ -9,6 +10,13 @@ namespace CPU { +enum : u32 +{ + FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, + FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, + FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, +}; + union CodeBlockKey { u32 bits; @@ -86,6 +94,7 @@ void Shutdown(); void Execute(); #ifdef WITH_RECOMPILER +CodeBlock::HostCodePointer* GetFastMapPointer(); void ExecuteRecompiler(); #endif @@ -102,6 +111,18 @@ template void InterpretCachedBlock(const CodeBlock& block); void InterpretUncachedBlock(); +/// Invalidates any code pages which overlap the specified range. +ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) +{ + const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; + const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; + for (u32 page = start_page; page <= end_page; page++) + { + if (Bus::m_ram_code_bits[page]) + CPU::CodeCache::InvalidateBlocksWithPageIndex(page); + } +} + }; // namespace CodeCache } // namespace CPU diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 463d64b5a..0d88e465b 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -1381,6 +1381,7 @@ void DispatchInterrupt() { // If the instruction we're about to execute is a GTE instruction, delay dispatching the interrupt until the next // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. 
+ SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction()) GTE::ExecuteInstruction(g_state.next_instruction.bits); diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 438786bd3..699c50d99 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -25,6 +25,8 @@ public: bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + CodeBlock::HostCodePointer CompileDispatcher(); + ////////////////////////////////////////////////////////////////////////// // Code Generation ////////////////////////////////////////////////////////////////////////// @@ -67,6 +69,7 @@ public: void EmitAddCPUStructField(u32 offset, const Value& value); void EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr); void EmitStoreGlobal(void* ptr, const Value& value); + void EmitLoadGlobalAddress(HostReg host_reg, const void* ptr); // Automatically generates an exception handler. 
Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 7d772c099..c262e35e5 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -6,6 +6,7 @@ #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" Log_SetChannel(CPU::Recompiler); namespace a64 = vixl::aarch64; @@ -26,6 +27,16 @@ constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 constexpr u64 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE; +// PC we return to after the end of the block +static void* s_dispatcher_return_address; + +static s64 GetPCDisplacement(const void* current, const void* target) +{ + Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); + Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); + return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); +} + static const a64::WRegister GetHostReg8(HostReg reg) { return a64::WRegister(reg); @@ -172,11 +183,11 @@ void CodeGenerator::EmitBeginBlock() // Save the link register, since we'll be calling functions. const bool link_reg_allocated = m_register_cache.AllocateHostReg(30); DebugAssert(link_reg_allocated); + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); // Store the CPU struct pointer. TODO: make this better. 
const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -185,6 +196,7 @@ void CodeGenerator::EmitEndBlock() m_register_cache.PopCalleeSavedRegisters(true); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -200,6 +212,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.PopCalleeSavedRegisters(false); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -958,13 +971,6 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) m_register_cache.PopCallerSavedRegisters(); } -static s64 GetBranchDisplacement(const void* current, const void* target) -{ - Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); - Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); - return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); -} - void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) { if (return_value) @@ -974,7 +980,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) const u32 adjust_size = PrepareStackForCall(); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1009,7 +1015,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG1, arg1); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = 
!vixl::IsInt26(displacement); if (use_blr) { @@ -1045,7 +1051,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG2, arg2); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1083,7 +1089,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG3, arg3); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1122,7 +1128,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG4, arg4); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1510,7 +1516,7 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (size) { case RegSize_8: @@ -1535,7 +1541,7 @@ void CodeGenerator::EmitStoreGlobal(void* ptr, const Value& value) { Value value_in_hr = GetValueInHostRegister(value); - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (value.size) { case RegSize_8: @@ -1882,4 +1888,130 @@ void CodeGenerator::EmitBindLabel(LabelType* label) m_emit->Bind(label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const void* 
current_code_ptr_page = reinterpret_cast( + reinterpret_cast(GetCurrentCodePointer()) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(ptr) & ~static_cast(0xFFF)); + const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(ptr) & 0xFFFu); + if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) + { + m_emit->adrp(GetHostReg64(host_reg), page_displacement); + m_emit->orr(GetHostReg64(host_reg), GetHostReg64(host_reg), page_offset); + } + else + { + m_emit->Mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); + } +} + +CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() +{ + m_emit->Sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_register_cache.ReserveCallerSavedRegisters(); + + EmitLoadGlobalAddress(RCPUPTR, &g_state); + + a64::Label frame_done_loop; + a64::Label exit_dispatcher; + m_emit->Bind(&frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->ldrb(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, frame_done))); + m_emit->tbnz(a64::w8, 0, &exit_dispatcher); + + // x8 <- sr + a64::Label no_interrupt; + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.sr.bits))); + + // if Iec == 0 then goto no_interrupt + m_emit->tbz(a64::w8, 0, &no_interrupt); + + // x9 <- cause + // x8 (sr) & cause + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.cause.bits))); + m_emit->and_(a64::w8, a64::w8, a64::w9); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->tst(a64::w8, 0xFF00); + m_emit->b(&no_interrupt, a64::eq); + + // we have an interrupt + EmitFunctionCall(nullptr, &DispatchInterrupt); + + // no interrupt or we just serviced it + m_emit->Bind(&no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // x8 <- head event->downcount + // downcount <- x8 + EmitLoadGlobalAddress(8, 
TimingEvents::GetHeadEventPtr()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x8)); + m_emit->ldr(a64::w8, a64::MemOperand(a64::x8, offsetof(TimingEvent, m_downcount))); + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // main dispatch loop + a64::Label main_loop; + m_emit->Bind(&main_loop); + s_dispatcher_return_address = GetCurrentCodePointer(); + + // w8 <- pending_ticks + // w9 <- downcount + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks))); + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // while pending_ticks < downcount + a64::Label downcount_hit; + m_emit->cmp(a64::w8, a64::w9); + m_emit->b(&downcount_hit, a64::ge); + + // time to lookup the block + // w8 <- pc + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, regs.pc))); + + // w9 <- (pc & RAM_MASK) >> 2 + m_emit->and_(a64::w9, a64::w8, Bus::RAM_MASK); + m_emit->lsr(a64::w9, a64::w9, 2); + + // w10 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->and_(a64::w10, a64::w8, Bus::BIOS_MASK); + m_emit->lsr(a64::w10, a64::w10, 2); + m_emit->add(a64::w10, a64::w10, FAST_MAP_RAM_SLOT_COUNT); + + // current_instruction_pc <- pc (w8) + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, current_instruction_pc))); + + // if ((w8 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use w10 as index } + m_emit->and_(a64::w8, a64::w8, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->Mov(a64::w11, Bus::BIOS_BASE); + m_emit->cmp(a64::w8, a64::w11); + m_emit->csel(a64::w8, a64::w9, a64::w10, a64::lt); + + // w8 contains our index, x8 <- fast_map[w8 * 8], x8() -- NOTE(review): unlike the x64 dispatcher, no branch back to main_loop follows this call + EmitLoadGlobalAddress(9, CodeCache::GetFastMapPointer()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3)); + m_emit->blr(a64::x8); + + // end while + m_emit->Bind(&downcount_hit); + + // check events then for frame done + 
EmitFunctionCall(nullptr, &TimingEvents::RunEvents); + m_emit->b(&frame_done_loop); + + // all done + m_emit->Bind(&exit_dispatcher); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_emit->Ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_InfoPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + return ptr; +} + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 142f86fad..82fc81cee 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,9 +1,12 @@ #include "common/align.h" +#include "common/log.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" +Log_SetChannel(Recompiler::CodeGenerator); namespace CPU::Recompiler { @@ -187,10 +190,12 @@ Value CodeGenerator::GetValueInHostRegister(const Value& value, bool allow_zero_ void CodeGenerator::EmitBeginBlock() { + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + // Store the CPU struct pointer. 
const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -2516,4 +2521,118 @@ void CodeGenerator::EmitBindLabel(LabelType* label) m_emit->L(*label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const s64 displacement = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr())) + 2; + if (Xbyak::inner::IsInInt32(static_cast(displacement))) + m_emit->lea(GetHostReg64(host_reg), m_emit->dword[m_emit->rip + ptr]); + else + m_emit->mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); +} + +CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() +{ + m_register_cache.ReserveCallerSavedRegisters(); + + EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state); + + Xbyak::Label frame_done_loop; + Xbyak::Label exit_dispatcher; + m_emit->L(frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->test(m_emit->byte[m_emit->rbp + offsetof(State, frame_done)], 1); + m_emit->jnz(exit_dispatcher, Xbyak::CodeGenerator::T_NEAR); + + // eax <- sr + Xbyak::Label no_interrupt; + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.sr.bits)]); + + // if Iec == 0 then goto no_interrupt + m_emit->test(m_emit->eax, 1); + m_emit->jz(no_interrupt); + + // sr & cause + m_emit->and_(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.cause.bits)]); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->test(m_emit->eax, 0xFF00); + m_emit->jz(no_interrupt); + + // we have an interrupt + EmitFunctionCall(nullptr, &DispatchInterrupt); + + // no interrupt or we just serviced it + m_emit->L(no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // eax <- head event->downcount + // downcount <- eax + EmitLoadGlobalAddress(Xbyak::Operand::RAX, 
TimingEvents::GetHeadEventPtr()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax]); + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rax + offsetof(TimingEvent, m_downcount)]); + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, downcount)], m_emit->eax); + + // main dispatch loop + Xbyak::Label main_loop; + m_emit->align(16); + m_emit->L(main_loop); + + // eax <- pending_ticks + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]); + + // while eax < downcount + Xbyak::Label downcount_hit; + m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, downcount)]); + m_emit->jge(downcount_hit); + + // time to lookup the block + // eax <- pc + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, regs.pc)]); + + // ebx <- (pc & RAM_MASK) >> 2 + m_emit->mov(m_emit->ebx, m_emit->eax); + m_emit->and_(m_emit->ebx, Bus::RAM_MASK); + m_emit->shr(m_emit->ebx, 2); + + // ecx <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->mov(m_emit->ecx, m_emit->eax); + m_emit->and_(m_emit->ecx, Bus::BIOS_MASK); + m_emit->shr(m_emit->ecx, 2); + m_emit->add(m_emit->ecx, FAST_MAP_RAM_SLOT_COUNT); + + // current_instruction_pc <- pc (eax) + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, current_instruction_pc)], m_emit->eax); + + // if ((eax (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use ecx as index } + m_emit->and_(m_emit->eax, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->cmp(m_emit->eax, Bus::BIOS_BASE); + m_emit->cmovge(m_emit->ebx, m_emit->ecx); + + // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue + EmitLoadGlobalAddress(Xbyak::Operand::RAX, CodeCache::GetFastMapPointer()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax + m_emit->rbx * 8]); + m_emit->call(m_emit->rax); + m_emit->jmp(main_loop); + + // end while + m_emit->L(downcount_hit); + + // check events then for frame done + EmitFunctionCall(nullptr, &TimingEvents::RunEvents); + m_emit->jmp(frame_done_loop); + 
+ // all done + m_emit->L(exit_dispatcher); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_InfoPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + return ptr; +} + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp index dac853c81..91738eead 100644 --- a/src/core/cpu_recompiler_register_cache.cpp +++ b/src/core/cpu_recompiler_register_cache.cpp @@ -351,6 +351,33 @@ u32 RegisterCache::PopCalleeSavedRegisters(bool commit) return count; } +void RegisterCache::ReserveCallerSavedRegisters() +{ + for (u32 reg = 0; reg < HostReg_Count; reg++) + { + if ((m_state.host_reg_state[reg] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) == + HostRegState::CalleeSaved) + { + DebugAssert(m_state.callee_saved_order_count < HostReg_Count); + m_code_generator.EmitPushHostReg(static_cast(reg), GetActiveCalleeSavedRegisterCount()); + m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast(reg); + m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated; + } + } +} + +void RegisterCache::AssumeCalleeSavedRegistersAreSaved() +{ + for (u32 i = 0; i < HostReg_Count; i++) + { + if ((m_state.host_reg_state[i] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) == + HostRegState::CalleeSaved) + { + m_state.host_reg_state[i] &= ~HostRegState::CalleeSaved; + } + } +} + void RegisterCache::PushState() { // need to copy this manually because of the load delay values diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index 0c989f296..b1092bef7 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -248,6 +248,12 @@ public: /// Restore callee-saved registers. Call at the end of the function. 
u32 PopCalleeSavedRegisters(bool commit); + /// Pushes every not-yet-saved callee-saved register up front (despite the name), enabling later use without stack pushes. + void ReserveCallerSavedRegisters(); + + /// Removes the callee-saved register flag from all registers. Call when compiling code blocks. + void AssumeCalleeSavedRegistersAreSaved(); + /// Pushes the register allocator state, use when entering branched code. void PushState(); diff --git a/src/core/dma.cpp b/src/core/dma.cpp index d685f5158..e4168e88b 100644 --- a/src/core/dma.cpp +++ b/src/core/dma.cpp @@ -4,6 +4,7 @@ #include "common/log.h" #include "common/state_wrapper.h" #include "common/string_util.h" +#include "cpu_code_cache.h" #include "cpu_core.h" #include "gpu.h" #include "interrupt_controller.h" @@ -499,7 +500,7 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen const u32 terminator = UINT32_C(0xFFFFFF); std::memcpy(&ram_pointer[address], &terminator, sizeof(terminator)); - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } @@ -547,6 +548,6 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen } } - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp index 8197eff80..4d1595f85 100644 --- a/src/core/timing_event.cpp +++ b/src/core/timing_event.cpp @@ -53,6 +53,11 @@ void UpdateCPUDowncount() CPU::g_state.downcount = s_active_events_head->GetDowncount(); } +TimingEvent** GetHeadEventPtr() +{ + return &s_active_events_head; +} + static void SortEvent(TimingEvent* event) { const TickCount event_downcount = event->m_downcount; diff --git a/src/core/timing_event.h b/src/core/timing_event.h index ca58ddbdf..0e012a1d7 100644 --- a/src/core/timing_event.h +++ b/src/core/timing_event.h @@ -88,6 +88,8 @@ void 
RunEvents(); void UpdateCPUDowncount(); +TimingEvent** GetHeadEventPtr(); + } // namespace TimingEventManager \ No newline at end of file From 401fc006eab2a7304d99777151f8c2ea8eae65f0 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 00:26:24 +1000 Subject: [PATCH 4/8] CPU/Recompiler: Skip tracking some more interpreter state --- src/core/cpu_recompiler_code_generator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index 912957bc7..b3f1bf4cb 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -845,8 +845,8 @@ void CodeGenerator::BlockPrologue() // we don't know the state of the last block, so assume load delays might be in progress // TODO: Pull load delay into register cache - m_current_instruction_in_branch_delay_slot_dirty = true; - m_branch_was_taken_dirty = true; + m_current_instruction_in_branch_delay_slot_dirty = g_settings.cpu_recompiler_memory_exceptions; + m_branch_was_taken_dirty = g_settings.cpu_recompiler_memory_exceptions; m_current_instruction_was_branch_taken_dirty = false; m_load_delay_dirty = true; @@ -909,7 +909,7 @@ void CodeGenerator::InstructionPrologue(const CodeBlockInstruction& cbi, TickCou return; } - if (cbi.is_branch_delay_slot) + if (cbi.is_branch_delay_slot && g_settings.cpu_recompiler_memory_exceptions) { // m_current_instruction_in_branch_delay_slot = true EmitStoreCPUStructField(offsetof(State, current_instruction_in_branch_delay_slot), Value::FromConstantU8(1)); From 7d1747b52745fe635eac1ec54d21e71646c7a602 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 15:18:53 +1000 Subject: [PATCH 5/8] CPU/Recompiler: Skip calling event update on interrupts --- src/core/cpu_recompiler_code_generator_aarch64.cpp | 6 ++++++ src/core/cpu_recompiler_code_generator_x64.cpp | 5 +++++ src/core/timing_event.cpp | 9 +-------- 3 files 
changed, 12 insertions(+), 8 deletions(-) diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index c262e35e5..3225be0ab 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1998,6 +1998,12 @@ CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() m_emit->Bind(&downcount_hit); // check events then for frame done + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks))); + EmitLoadGlobalAddress(9, TimingEvents::GetHeadEventPtr()); + m_emit->ldr(a64::x9, a64::MemOperand(a64::x9)); + m_emit->ldr(a64::w9, a64::MemOperand(a64::x9, offsetof(TimingEvent, m_downcount))); + m_emit->cmp(a64::w8, a64::w9); + m_emit->b(&frame_done_loop, a64::lt); EmitFunctionCall(nullptr, &TimingEvents::RunEvents); m_emit->b(&frame_done_loop); diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 82fc81cee..5770b4f4f 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -2620,6 +2620,11 @@ CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() m_emit->L(downcount_hit); // check events then for frame done + EmitLoadGlobalAddress(Xbyak::Operand::RAX, TimingEvents::GetHeadEventPtr()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax]); + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rax + offsetof(TimingEvent, m_downcount)]); + m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]); + m_emit->jg(frame_done_loop); EmitFunctionCall(nullptr, &TimingEvents::RunEvents); m_emit->jmp(frame_done_loop); diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp index 4d1595f85..e09108be1 100644 --- a/src/core/timing_event.cpp +++ b/src/core/timing_event.cpp @@ -13,7 +13,6 @@ static TimingEvent* s_active_events_tail; static TimingEvent* s_current_event = 
nullptr; static u32 s_active_event_count = 0; static u32 s_global_tick_counter = 0; -static u32 s_last_event_run_time = 0; u32 GetGlobalTickCounter() { @@ -28,7 +27,6 @@ void Initialize() void Reset() { s_global_tick_counter = 0; - s_last_event_run_time = 0; } void Shutdown() @@ -260,7 +258,7 @@ void RunEvents() { DebugAssert(!s_current_event); - TickCount pending_ticks = (s_global_tick_counter + CPU::GetPendingTicks()) - s_last_event_run_time; + TickCount pending_ticks = CPU::GetPendingTicks(); CPU::ResetPendingTicks(); while (pending_ticks > 0) { @@ -296,7 +294,6 @@ void RunEvents() } } - s_last_event_run_time = s_global_tick_counter; s_current_event = nullptr; UpdateCPUDowncount(); } @@ -338,8 +335,6 @@ bool DoState(StateWrapper& sw) event->m_interval = interval; } - sw.Do(&s_last_event_run_time); - Log_DevPrintf("Loaded %u events from save state.", event_count); SortEvents(); } @@ -357,8 +352,6 @@ bool DoState(StateWrapper& sw) sw.Do(&event->m_interval); } - sw.Do(&s_last_event_run_time); - Log_DevPrintf("Wrote %u events to save state.", s_active_event_count); } From 3dd717aca8ea68eea74e2eaf93f45247e4840969 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 21:07:03 +1000 Subject: [PATCH 6/8] Common: Add memory arena and page fault handler classes --- src/common/CMakeLists.txt | 9 ++ src/common/common.vcxproj | 6 +- src/common/common.vcxproj.filters | 6 +- src/common/memory_arena.cpp | 213 ++++++++++++++++++++++++++++++ src/common/memory_arena.h | 58 ++++++++ src/common/page_fault_handler.cpp | 185 ++++++++++++++++++++++++++ src/common/page_fault_handler.h | 18 +++ 7 files changed, 493 insertions(+), 2 deletions(-) create mode 100644 src/common/memory_arena.cpp create mode 100644 src/common/memory_arena.h create mode 100644 src/common/page_fault_handler.cpp create mode 100644 src/common/page_fault_handler.h diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index b7600fd75..f224c63d8 100644 --- a/src/common/CMakeLists.txt +++ 
b/src/common/CMakeLists.txt @@ -56,6 +56,10 @@ add_library(common minizip_helpers.h null_audio_stream.cpp null_audio_stream.h + memory_arena.cpp + memory_arena.h + page_fault_handler.cpp + page_fault_handler.h rectangle.h progress_callback.cpp progress_callback.h @@ -180,3 +184,8 @@ if(APPLE AND NOT BUILD_LIBRETRO_CORE) gl/context_agl.h ) endif() + +if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + # We need -lrt for shm_unlink + target_link_libraries(common PRIVATE rt) +endif() diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index f81abf94c..fd08eb5c1 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -70,6 +70,8 @@ + + @@ -130,6 +132,8 @@ + + @@ -543,4 +547,4 @@ - \ No newline at end of file + diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters index 2903f3d34..62268c8f4 100644 --- a/src/common/common.vcxproj.filters +++ b/src/common/common.vcxproj.filters @@ -103,6 +103,8 @@ + + @@ -198,6 +200,8 @@ + + @@ -213,4 +217,4 @@ {642ff5eb-af39-4aab-a42f-6eb8188a11d7} - \ No newline at end of file + diff --git a/src/common/memory_arena.cpp b/src/common/memory_arena.cpp new file mode 100644 index 000000000..cdb52ff11 --- /dev/null +++ b/src/common/memory_arena.cpp @@ -0,0 +1,213 @@ +#include "memory_arena.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +Log_SetChannel(Common::MemoryArena); + +#if defined(WIN32) +#include "common/windows_headers.h" +#elif defined(__linux__) || defined(__ANDROID__) +#include +#include +#include +#include +#endif + +namespace Common { + +MemoryArena::MemoryArena() = default; + +MemoryArena::~MemoryArena() +{ +#if defined(WIN32) + if (m_file_handle) + CloseHandle(m_file_handle); +#elif defined(__linux__) + if (m_shmem_fd > 0) + close(m_shmem_fd); +#endif +} + +void* MemoryArena::FindBaseAddressForMapping(size_t size) +{ + void* base_address; +#if defined(WIN32) + base_address = VirtualAlloc(nullptr, size, MEM_RESERVE, PAGE_READWRITE); 
+ if (base_address) + VirtualFree(base_address, 0, MEM_RELEASE); +#elif defined(__linux__) + base_address = mmap(nullptr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); + if (base_address) + munmap(base_address, size); +#elif defined(__ANDROID__) + base_address = mmap(nullptr, size, PROT_NONE, MAP_ANON | MAP_SHARED, -1, 0); + if (base_address) + munmap(base_address, size); +#else + base_address = nullptr; +#endif + + if (!base_address) + { + Log_ErrorPrintf("Failed to get base address for memory mapping of size %zu", size); + return nullptr; + } + + return base_address; +} + +bool MemoryArena::Create(size_t size, bool writable, bool executable) +{ +#if defined(WIN32) + const std::string file_mapping_name = + StringUtil::StdStringFromFormat("common_memory_arena_%zu_%u", size, GetCurrentProcessId()); + + const DWORD protect = (writable ? (executable ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE) : PAGE_READONLY); + m_file_handle = CreateFileMappingA(INVALID_HANDLE_VALUE, nullptr, protect, Truncate32(size >> 32), Truncate32(size), + file_mapping_name.c_str()); + if (!m_file_handle) + { + Log_ErrorPrintf("CreateFileMapping failed: %u", GetLastError()); + return false; + } + + return true; +#elif defined(__linux__) + const std::string file_mapping_name = + StringUtil::StdStringFromFormat("common_memory_arena_%zu_%u", size, static_cast(getpid())); + m_shmem_fd = shm_open(file_mapping_name.c_str(), O_CREAT | O_EXCL | (writable ? 
O_RDWR : O_RDONLY), 0600); + if (m_shmem_fd < 0) + { + Log_ErrorPrintf("shm_open failed: %d", errno); + return false; + } + + // we're not going to be opening this mapping in other processes, so remove the file + shm_unlink(file_mapping_name.c_str()); + + // ensure it's the correct size + if (ftruncate64(m_shmem_fd, static_cast(size)) < 0) + { + Log_ErrorPrintf("ftruncate64(%zu) failed: %d", size, errno); + return false; + } + + return true; +#else + return false; +#endif +} + +std::optional MemoryArena::CreateView(size_t offset, size_t size, bool writable, bool executable, + void* fixed_address) +{ + void* base_pointer = CreateViewPtr(offset, size, writable, executable, fixed_address); + if (!base_pointer) + return std::nullopt; + + return View(this, base_pointer, offset, size, writable); +} + +void* MemoryArena::CreateViewPtr(size_t offset, size_t size, bool writable, bool executable, + void* fixed_address /*= nullptr*/) +{ + void* base_pointer; +#if defined(WIN32) + const DWORD desired_access = FILE_MAP_READ | (writable ? FILE_MAP_WRITE : 0) | (executable ? FILE_MAP_EXECUTE : 0); + base_pointer = + MapViewOfFileEx(m_file_handle, desired_access, Truncate32(offset >> 32), Truncate32(offset), size, fixed_address); + if (!base_pointer) + return nullptr; +#elif defined(__linux__) + const int flags = (fixed_address != nullptr) ? (MAP_SHARED | MAP_FIXED) : MAP_SHARED; + const int prot = PROT_READ | (writable ? PROT_WRITE : 0) | (executable ? 
PROT_EXEC : 0); + base_pointer = mmap64(fixed_address, size, prot, flags, m_shmem_fd, static_cast(offset)); + if (base_pointer == reinterpret_cast(-1)) + return nullptr; +#else + return nullptr; +#endif + + m_num_views.fetch_add(1); + return base_pointer; +} + +bool MemoryArena::FlushViewPtr(void* address, size_t size) +{ +#if defined(WIN32) + return FlushViewOfFile(address, size); +#elif defined(__linux__) + return (msync(address, size, 0) >= 0); +#else + return false; +#endif +} + +bool MemoryArena::ReleaseViewPtr(void* address, size_t size) +{ + bool result; +#if defined(WIN32) + result = static_cast(UnmapViewOfFile(address)); +#elif defined(__linux__) + result = (munmap(address, size) >= 0); +#else + result = false; +#endif + + if (!result) + { + Log_ErrorPrintf("Failed to unmap previously-created view at %p", address); + return false; + } + + const size_t prev_count = m_num_views.fetch_sub(1); + Assert(prev_count > 0); + return true; +} + +bool MemoryArena::SetPageProtection(void* address, size_t length, bool readable, bool writable, bool executable) +{ +#if defined(WIN32) + static constexpr DWORD protection_table[2][2][2] = { + {{PAGE_NOACCESS, PAGE_EXECUTE}, {PAGE_WRITECOPY, PAGE_EXECUTE_WRITECOPY}}, + {{PAGE_READONLY, PAGE_EXECUTE_READ}, {PAGE_READWRITE, PAGE_EXECUTE_READWRITE}}}; + + DWORD old_protect; + return static_cast( + VirtualProtect(address, length, protection_table[readable][writable][executable], &old_protect)); +#elif defined(__linux__) || defined(__ANDROID__) + const int prot = (readable ? PROT_READ : 0) | (writable ? PROT_WRITE : 0) | (executable ? 
PROT_EXEC : 0); + return (mprotect(address, length, prot) >= 0); +#else + return false; +#endif +} + +MemoryArena::View::View(MemoryArena* parent, void* base_pointer, size_t arena_offset, size_t mapping_size, + bool writable) + : m_parent(parent), m_base_pointer(base_pointer), m_arena_offset(arena_offset), m_mapping_size(mapping_size), + m_writable(writable) +{ +} + +MemoryArena::View::View(View&& view) + : m_parent(view.m_parent), m_base_pointer(view.m_base_pointer), m_arena_offset(view.m_arena_offset), + m_mapping_size(view.m_mapping_size), m_writable(view.m_writable) +{ + view.m_parent = nullptr; + view.m_base_pointer = nullptr; + view.m_arena_offset = 0; + view.m_mapping_size = 0; +} + +MemoryArena::View::~View() +{ + if (m_parent) + { + if (m_writable && !m_parent->FlushViewPtr(m_base_pointer, m_mapping_size)) + Panic("Failed to flush previously-created view"); + if (!m_parent->ReleaseViewPtr(m_base_pointer, m_mapping_size)) + Panic("Failed to unmap previously-created view"); + } +} +} // namespace Common diff --git a/src/common/memory_arena.h b/src/common/memory_arena.h new file mode 100644 index 000000000..8e175bd47 --- /dev/null +++ b/src/common/memory_arena.h @@ -0,0 +1,58 @@ +#pragma once +#include "types.h" +#include +#include + +namespace Common { +class MemoryArena +{ +public: + class View + { + public: + View(MemoryArena* parent, void* base_pointer, size_t arena_offset, size_t mapping_size, bool writable); + View(View&& view); + ~View(); + + void* GetBasePointer() const { return m_base_pointer; } + size_t GetArenaOffset() const { return m_arena_offset; } + size_t GetMappingSize() const { return m_mapping_size; } + bool IsWritable() const { return m_writable; } + + private: + MemoryArena* m_parent; + void* m_base_pointer; + size_t m_arena_offset; + size_t m_mapping_size; + bool m_writable; + }; + + MemoryArena(); + ~MemoryArena(); + + static void* FindBaseAddressForMapping(size_t size); + + bool Create(size_t size, bool writable, bool executable); + + std::optional 
CreateView(size_t offset, size_t size, bool writable, bool executable, + void* fixed_address = nullptr); + + void* CreateViewPtr(size_t offset, size_t size, bool writable, bool executable, void* fixed_address = nullptr); + bool FlushViewPtr(void* address, size_t size); + bool ReleaseViewPtr(void* address, size_t size); + + static bool SetPageProtection(void* address, size_t length, bool readable, bool writable, bool executable); + +private: +#if defined(WIN32) + void* m_file_handle = nullptr; +#elif defined(__linux__) + int m_shmem_fd = -1; +#endif + + std::atomic_size_t m_num_views{0}; + size_t m_size = 0; + bool m_writable = false; + bool m_executable = false; +}; +} // namespace Common diff --git a/src/common/page_fault_handler.cpp b/src/common/page_fault_handler.cpp new file mode 100644 index 000000000..67d3192b2 --- /dev/null +++ b/src/common/page_fault_handler.cpp @@ -0,0 +1,185 @@ +#include "page_fault_handler.h" +#include "common/log.h" +#include +#include +#include +Log_SetChannel(Common::PageFaultHandler); + +#if defined(WIN32) +#include "common/windows_headers.h" +#elif defined(__linux__) || defined(__ANDROID__) +#include +#include +#include +#define USE_SIGSEGV 1 +#endif + +namespace Common::PageFaultHandler { + +struct RegisteredHandler +{ + void* owner; + Callback callback; +}; +static std::vector m_handlers; +static std::mutex m_handler_lock; +static thread_local bool s_in_handler; + +#if defined(WIN32) +static PVOID s_veh_handle; + +static LONG ExceptionHandler(PEXCEPTION_POINTERS exi) +{ + if (exi->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION || s_in_handler) + return EXCEPTION_CONTINUE_SEARCH; + + s_in_handler = true; + + void* const exception_pc = reinterpret_cast(exi->ContextRecord->Rip); + void* const exception_address = reinterpret_cast(exi->ExceptionRecord->ExceptionInformation[1]); + bool const is_write = exi->ExceptionRecord->ExceptionInformation[0] == 1; + + std::lock_guard guard(m_handler_lock); + for (const 
RegisteredHandler& rh : m_handlers) + { + if (rh.callback(exception_pc, exception_address, is_write) == HandlerResult::ContinueExecution) + { + s_in_handler = false; + return EXCEPTION_CONTINUE_EXECUTION; + } + } + + s_in_handler = false; + return EXCEPTION_CONTINUE_SEARCH; +} + +#elif defined(USE_SIGSEGV) + +static struct sigaction s_old_sigsegv_action; + +static void SIGSEGVHandler(int sig, siginfo_t* info, void* ctx) +{ + if ((info->si_code != SEGV_MAPERR && info->si_code != SEGV_ACCERR) || s_in_handler) + return; + + void* const exception_address = reinterpret_cast(info->si_addr); + +#if defined(__x86_64__) + void* const exception_pc = reinterpret_cast(static_cast(ctx)->uc_mcontext.gregs[REG_RIP]); + const bool is_write = (static_cast(ctx)->uc_mcontext.gregs[REG_ERR] & 2) != 0; +#elif defined(__aarch64__) + void* const exception_pc = reinterpret_cast(static_cast(ctx)->uc_mcontext.pc); + const bool is_write = false; +#else + void* const exception_pc = nullptr; + const bool is_write = false; +#endif + + std::lock_guard guard(m_handler_lock); + for (const RegisteredHandler& rh : m_handlers) + { + if (rh.callback(exception_pc, exception_address, is_write) == HandlerResult::ContinueExecution) + { + s_in_handler = false; + return; + } + } + + // call old signal handler + if (s_old_sigsegv_action.sa_flags & SA_SIGINFO) + s_old_sigsegv_action.sa_sigaction(sig, info, ctx); + else if (s_old_sigsegv_action.sa_handler == SIG_DFL) + signal(sig, SIG_DFL); + else if (s_old_sigsegv_action.sa_handler == SIG_IGN) + return; + else + s_old_sigsegv_action.sa_handler(sig); +} + +#endif + +bool InstallHandler(void* owner, Callback callback) +{ + bool was_empty; + { + std::lock_guard guard(m_handler_lock); + if (std::find_if(m_handlers.begin(), m_handlers.end(), + [owner](const RegisteredHandler& rh) { return rh.owner == owner; }) != m_handlers.end()) + { + return false; + } + + was_empty = m_handlers.empty(); + m_handlers.push_back(RegisteredHandler{owner, std::move(callback)}); + } 
+ + if (was_empty) + { +#if defined(WIN32) + s_veh_handle = AddVectoredExceptionHandler(1, ExceptionHandler); + if (!s_veh_handle) + { + Log_ErrorPrint("Failed to add vectored exception handler"); + return false; + } +#elif defined(USE_SIGSEGV) +#if 0 + // TODO: Is this needed? + stack_t signal_stack = {}; + signal_stack.ss_sp = malloc(SIGSTKSZ); + signal_stack.ss_size = SIGSTKSZ; + if (sigaltstack(&signal_stack, nullptr)) + { + Log_ErrorPrintf("sigaltstack() failed: %d", errno); + return false; + } +#endif + + struct sigaction sa = {}; + sa.sa_sigaction = SIGSEGVHandler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &s_old_sigsegv_action) < 0) + { + Log_ErrorPrintf("sigaction() failed: %d", errno); + return false; + } +#else + return false; +#endif + } + + return true; +} + +bool RemoveHandler(void* owner) +{ + std::lock_guard guard(m_handler_lock); + auto it = std::find_if(m_handlers.begin(), m_handlers.end(), + [owner](const RegisteredHandler& rh) { return rh.owner == owner; }); + if (it == m_handlers.end()) + return false; + + m_handlers.erase(it); + + if (m_handlers.empty()) + { +#if defined(WIN32) + RemoveVectoredExceptionHandler(s_veh_handle); + s_veh_handle = nullptr; +#elif defined(USE_SIGSEGV) + // restore old signal handler + if (sigaction(SIGSEGV, &s_old_sigsegv_action, nullptr) < 0) + { + Log_ErrorPrintf("sigaction() failed: %d", errno); + return false; + } + + s_old_sigsegv_action = {}; +#endif + } + + return true; +} + +} // namespace Common::PageFaultHandler diff --git a/src/common/page_fault_handler.h b/src/common/page_fault_handler.h new file mode 100644 index 000000000..b2c4f9040 --- /dev/null +++ b/src/common/page_fault_handler.h @@ -0,0 +1,18 @@ +#pragma once +#include "types.h" +#include + +namespace Common::PageFaultHandler { +enum class HandlerResult +{ + ContinueExecution, + ExecuteNextHandler, +}; + +using Callback = std::function; +using Handle = void*; + +bool InstallHandler(void* owner, Callback callback); +bool 
RemoveHandler(void* owner); + +} // namespace Common::PageFaultHandler From 5e45330703c72e7595d177c187e5e1517b42c709 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 21:07:09 +1000 Subject: [PATCH 7/8] WIP fastmem --- src/common/page_fault_handler.cpp | 1 + src/common/page_fault_handler.h | 3 +- src/core/bus.cpp | 214 +++++++++- src/core/bus.h | 58 ++- src/core/cpu_code_cache.cpp | 194 ++++++++- src/core/cpu_code_cache.h | 16 +- src/core/cpu_core.cpp | 6 + src/core/cpu_core.h | 2 + src/core/cpu_recompiler_code_generator.cpp | 23 +- src/core/cpu_recompiler_code_generator.h | 11 +- .../cpu_recompiler_code_generator_aarch64.cpp | 283 ++++++++++--- .../cpu_recompiler_code_generator_x64.cpp | 397 +++++++++++++++--- src/core/cpu_recompiler_thunks.h | 1 + src/core/cpu_recompiler_types.h | 10 + src/core/host_interface.cpp | 12 +- src/core/settings.cpp | 2 + src/core/settings.h | 6 + src/core/system.cpp | 8 +- src/core/types.h | 2 +- 19 files changed, 1070 insertions(+), 179 deletions(-) diff --git a/src/common/page_fault_handler.cpp b/src/common/page_fault_handler.cpp index 67d3192b2..448783475 100644 --- a/src/common/page_fault_handler.cpp +++ b/src/common/page_fault_handler.cpp @@ -3,6 +3,7 @@ #include #include #include +#include Log_SetChannel(Common::PageFaultHandler); #if defined(WIN32) diff --git a/src/common/page_fault_handler.h b/src/common/page_fault_handler.h index b2c4f9040..67ef38cbd 100644 --- a/src/common/page_fault_handler.h +++ b/src/common/page_fault_handler.h @@ -1,6 +1,5 @@ #pragma once #include "types.h" -#include namespace Common::PageFaultHandler { enum class HandlerResult @@ -9,7 +8,7 @@ enum class HandlerResult ExecuteNextHandler, }; -using Callback = std::function; +using Callback = HandlerResult(*)(void* exception_pc, void* fault_address, bool is_write); using Handle = void*; bool InstallHandler(void* owner, Callback callback); diff --git a/src/core/bus.cpp b/src/core/bus.cpp index f238e6b17..340bca2c5 100644 --- 
a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -10,6 +10,7 @@ #include "cpu_disasm.h" #include "dma.h" #include "gpu.h" +#include "host_interface.h" #include "interrupt_controller.h" #include "mdec.h" #include "pad.h" @@ -22,11 +23,6 @@ Log_SetChannel(Bus); namespace Bus { -enum : TickCount -{ - RAM_READ_TICKS = 4 -}; - union MEMDELAY { u32 bits; @@ -74,8 +70,8 @@ union MEMCTRL }; std::bitset m_ram_code_bits{}; -u8 g_ram[RAM_SIZE]{}; // 2MB RAM -u8 g_bios[BIOS_SIZE]{}; // 512K BIOS ROM +u8* g_ram = nullptr; // 2MB RAM +u8* g_bios = nullptr; // 512K BIOS ROM static std::array m_exp1_access_time = {}; static std::array m_exp2_access_time = {}; @@ -90,9 +86,15 @@ static u32 m_ram_size_reg = 0; static std::string m_tty_line_buffer; +static Common::MemoryArena m_memory_arena; +static u8* m_fastmem_base = nullptr; +static std::vector m_fastmem_ram_views; + static std::tuple CalculateMemoryTiming(MEMDELAY mem_delay, COMDELAY common_delay); static void RecalculateMemoryTimings(); +static void SetCodePageFastmemProtection(u32 page_index, bool writable); + #define FIXUP_WORD_READ_OFFSET(offset) ((offset) & ~u32(3)) #define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset)&u32(3)) * 8u)) #define FIXUP_HALFWORD_READ_OFFSET(offset) ((offset) & ~u32(1)) @@ -108,19 +110,32 @@ ALWAYS_INLINE static void FixupUnalignedWordAccessW32(u32& offset, u32& value) value <<= byte_offset * 8; } -void Initialize() +bool Initialize() { + if (!AllocateMemory()) + { + g_host_interface->ReportError("Failed to allocate memory"); + return false; + } + Reset(); + return true; } void Shutdown() { - // + m_fastmem_ram_views.clear(); + if (g_ram) + m_memory_arena.ReleaseViewPtr(g_ram, RAM_SIZE); + if (g_bios) + m_memory_arena.ReleaseViewPtr(g_bios, BIOS_SIZE); + + CPU::g_state.fastmem_base = nullptr; } void Reset() { - std::memset(g_ram, 0, sizeof(g_ram)); + std::memset(g_ram, 0, RAM_SIZE); m_MEMCTRL.exp1_base = 0x1F000000; m_MEMCTRL.exp2_base = 0x1F802000; m_MEMCTRL.exp1_delay_size.bits = 
0x0013243F; @@ -142,8 +157,8 @@ bool DoState(StateWrapper& sw) sw.Do(&m_bios_access_time); sw.Do(&m_cdrom_access_time); sw.Do(&m_spu_access_time); - sw.DoBytes(g_ram, sizeof(g_ram)); - sw.DoBytes(g_bios, sizeof(g_bios)); + sw.DoBytes(g_ram, RAM_SIZE); + sw.DoBytes(g_bios, BIOS_SIZE); sw.DoArray(m_MEMCTRL.regs, countof(m_MEMCTRL.regs)); sw.Do(&m_ram_size_reg); sw.Do(&m_tty_line_buffer); @@ -222,6 +237,181 @@ void RecalculateMemoryTimings() m_spu_access_time[2] + 1); } +bool AllocateMemory() +{ + if (!m_memory_arena.Create(MEMORY_ARENA_SIZE, true, false)) + { + Log_ErrorPrint("Failed to create memory arena"); + return false; + } + + // Create the base views. + g_ram = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false)); + g_bios = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, true, false)); + if (!g_ram || !g_bios) + { + Log_ErrorPrint("Failed to create base views of memory"); + return false; + } + + return true; +} + +void UpdateFastmemViews(bool enabled, bool isolate_cache) +{ + m_fastmem_ram_views.clear(); + if (!enabled) + { + m_fastmem_base = nullptr; + return; + } + + Log_DevPrintf("Remapping fastmem area, isolate cache = %s", isolate_cache ? 
"true " : "false"); + if (!m_fastmem_base) + { + m_fastmem_base = static_cast(m_memory_arena.FindBaseAddressForMapping(FASTMEM_REGION_SIZE)); + if (!m_fastmem_base) + { + Log_ErrorPrint("Failed to find base address for fastmem"); + return; + } + + Log_InfoPrintf("Fastmem base: %p", m_fastmem_base); + CPU::g_state.fastmem_base = m_fastmem_base; + } + + auto MapRAM = [](u32 base_address) { + u8* map_address = m_fastmem_base + base_address; + auto view = m_memory_arena.CreateView(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false, map_address); + if (!view) + { + Log_ErrorPrintf("Failed to map RAM at fastmem area %p (offset 0x%08X)", map_address, base_address); + return; + } + + // mark all pages with code as non-writable + for (u32 i = 0; i < CPU_CODE_CACHE_PAGE_COUNT; i++) + { + if (m_ram_code_bits[i]) + { + u8* page_address = map_address + (i * CPU_CODE_CACHE_PAGE_SIZE); + if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, false, false)) + { + Log_ErrorPrintf("Failed to write-protect code page at %p", page_address); + return; + } + } + } + + m_fastmem_ram_views.push_back(std::move(view.value())); + }; + auto MapBIOS = [](u32 base_address) { + u8* map_address = m_fastmem_base + base_address; + auto view = m_memory_arena.CreateView(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, false, false, map_address); + if (!view) + { + Log_ErrorPrintf("Failed to map BIOS at fastmem area %p (offset 0x%08X)", map_address, base_address); + return; + } + + m_fastmem_ram_views.push_back(std::move(view.value())); + }; + + if (!isolate_cache) + { + // KUSEG - cached + MapRAM(0x00000000); + // MapBIOS(0x1FC00000); + + // KSEG0 - cached + MapRAM(0x80000000); + // MapBIOS(0x9FC00000); + } + + // KSEG1 - uncached + MapRAM(0xA0000000); + // MapBIOS(0xBFC00000); +} + +bool IsRAMCodePage(u32 index) +{ + return m_ram_code_bits[index]; +} + +void SetRAMCodePage(u32 index) +{ + if (m_ram_code_bits[index]) + return; + + // protect fastmem pages + m_ram_code_bits[index] = true; + 
SetCodePageFastmemProtection(index, false); +} + +void ClearRAMCodePage(u32 index) +{ + if (!m_ram_code_bits[index]) + return; + + // unprotect fastmem pages + m_ram_code_bits[index] = false; + SetCodePageFastmemProtection(index, true); +} + +void SetCodePageFastmemProtection(u32 page_index, bool writable) +{ + // apply the requested write protection to this page in every fastmem view + for (const auto& view : m_fastmem_ram_views) + { + u8* page_address = static_cast(view.GetBasePointer()) + (page_index * CPU_CODE_CACHE_PAGE_SIZE); + if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, writable, false)) + { + Log_ErrorPrintf("Failed to %s code page %u (0x%08X) @ %p", writable ? "unprotect" : "protect", page_index, + page_index * CPU_CODE_CACHE_PAGE_SIZE, page_address); + } + } +} + +void ClearRAMCodePageFlags() +{ + m_ram_code_bits.reset(); + + // unprotect fastmem pages + for (const auto& view : m_fastmem_ram_views) + { + if (!m_memory_arena.SetPageProtection(view.GetBasePointer(), view.GetMappingSize(), true, true, false)) + { + Log_ErrorPrintf("Failed to unprotect code pages for fastmem view @ %p", view.GetBasePointer()); + } + } +} + +bool IsCodePageAddress(PhysicalMemoryAddress address) +{ + return IsRAMAddress(address) ? 
m_ram_code_bits[(address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE] : false; +} + +bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size) +{ + if (!IsRAMAddress(start_address)) + return false; + + start_address = (start_address & RAM_MASK); + + const u32 end_address = start_address + size; + while (start_address < end_address && (start_address / CPU_CODE_CACHE_PAGE_SIZE) < m_ram_code_bits.size()) + { + const u32 code_page_index = start_address / CPU_CODE_CACHE_PAGE_SIZE; + if (m_ram_code_bits[code_page_index]) + return true; + + start_address = (code_page_index + 1) * CPU_CODE_CACHE_PAGE_SIZE; // next page boundary, so an unaligned start cannot skip the last page + } + + return false; +} + + static TickCount DoInvalidAccess(MemoryAccessType type, MemoryAccessSize size, PhysicalMemoryAddress address, u32& value) { diff --git a/src/core/bus.h b/src/core/bus.h index d2f187ba6..ef1905bee 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -1,5 +1,6 @@ #pragma once #include "common/bitfield.h" +#include "common/memory_arena.h" #include "types.h" #include #include @@ -65,26 +66,69 @@ enum : u32 MEMCTRL_REG_COUNT = 9 }; -void Initialize(); +enum : TickCount +{ + RAM_READ_TICKS = 4 +}; + +enum : size_t +{ + // Our memory arena contains storage for RAM and BIOS. + MEMORY_ARENA_SIZE = RAM_SIZE + BIOS_SIZE, + + // Offsets within the memory arena. + MEMORY_ARENA_RAM_OFFSET = 0, + MEMORY_ARENA_BIOS_OFFSET = MEMORY_ARENA_RAM_OFFSET + RAM_SIZE, + + // Fastmem region size is 4GB to cover the entire 32-bit address space. 
+ FASTMEM_REGION_SIZE = UINT64_C(0x100000000) +}; + +bool Initialize(); void Shutdown(); void Reset(); bool DoState(StateWrapper& sw); +u8* GetFastmemBase(); +bool AllocateMemory(); +void UpdateFastmemViews(bool enabled, bool isolate_cache); + void SetExpansionROM(std::vector data); void SetBIOS(const std::vector& image); extern std::bitset m_ram_code_bits; -extern u8 g_ram[RAM_SIZE]; // 2MB RAM -extern u8 g_bios[BIOS_SIZE]; // 512K BIOS ROM +extern u8* g_ram; // 2MB RAM +extern u8* g_bios; // 512K BIOS ROM + +/// Returns true if the address specified is writable (RAM). 
+ALWAYS_INLINE static bool IsRAMAddress(PhysicalMemoryAddress address) +{ + return address < RAM_MIRROR_END; +} + +/// Returns the code page index for a RAM address. +ALWAYS_INLINE static u32 GetRAMCodePageIndex(PhysicalMemoryAddress address) +{ + return (address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE; +} + +/// Returns true if the specified page contains code. +bool IsRAMCodePage(u32 index); /// Flags a RAM region as code, so we know when to invalidate blocks. -ALWAYS_INLINE void SetRAMCodePage(u32 index) { m_ram_code_bits[index] = true; } +void SetRAMCodePage(u32 index); /// Unflags a RAM region as code, the code cache will no longer be notified when writes occur. -ALWAYS_INLINE void ClearRAMCodePage(u32 index) { m_ram_code_bits[index] = false; } +void ClearRAMCodePage(u32 index); /// Clears all code bits for RAM regions. -ALWAYS_INLINE void ClearRAMCodePageFlags() { m_ram_code_bits.reset(); } +void ClearRAMCodePageFlags(); + +/// Returns true if the specified address is in a code page. +bool IsCodePageAddress(PhysicalMemoryAddress address); + +/// Returns true if the range specified overlaps with a code page. +bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size); /// Returns the number of cycles stolen by DMA RAM access. 
ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) @@ -96,4 +140,4 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } -} // namespace Bus +} // namespace Bus \ No newline at end of file diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index e4a1a55e5..041dd81aa 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -5,6 +5,7 @@ #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_disasm.h" +#include "settings.h" #include "system.h" #include "timing_event.h" Log_SetChannel(CPU::CodeCache); @@ -61,6 +62,7 @@ static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function) #endif using BlockMap = std::unordered_map; +using HostCodeMap = std::map; void LogCurrentState(); @@ -85,36 +87,49 @@ static void LinkBlock(CodeBlock* from, CodeBlock* to); /// Unlink all blocks which point to this block, and any that this block links to. static void UnlinkBlock(CodeBlock* block); -static bool s_use_recompiler = false; static BlockMap s_blocks; static std::array, CPU_CODE_CACHE_PAGE_COUNT> m_ram_block_map; -void Initialize(bool use_recompiler) +#ifdef WITH_RECOMPILER +static HostCodeMap s_host_code_map; + +static void AddBlockToHostCodeMap(CodeBlock* block); +static void RemoveBlockFromHostCodeMap(CodeBlock* block); +static bool InitializeFastmem(); +static void ShutdownFastmem(); +static Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write); +#endif + +void Initialize() { Assert(s_blocks.empty()); #ifdef WITH_RECOMPILER - s_use_recompiler = use_recompiler; -#ifdef USE_STATIC_CODE_BUFFER - if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, - RECOMPILER_GUARD_SIZE)) -#else - if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) -#endif + if (g_settings.IsUsingRecompiler()) { - Panic("Failed to 
initialize code space"); - } - - ResetFastMap(); - CompileDispatcher(); +#ifdef USE_STATIC_CODE_BUFFER + if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, + RECOMPILER_GUARD_SIZE)) #else - s_use_recompiler = false; + if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) +#endif + { + Panic("Failed to initialize code space"); + } + + if (g_settings.IsUsingFastmem() && !InitializeFastmem()) + Panic("Failed to initialize fastmem"); + + ResetFastMap(); + CompileDispatcher(); + } #endif } void Shutdown() { Flush(); #ifdef WITH_RECOMPILER + ShutdownFastmem(); s_code_buffer.Destroy(); #endif @@ -279,14 +294,33 @@ void ExecuteRecompiler() #endif -void SetUseRecompiler(bool enable) +void Reinitialize() { -#ifdef WITH_RECOMPILER - if (s_use_recompiler == enable) - return; - - s_use_recompiler = enable; Flush(); +#ifdef WITH_RECOMPILER + + ShutdownFastmem(); + s_code_buffer.Destroy(); + + if (g_settings.IsUsingRecompiler()) + { + +#ifdef USE_STATIC_CODE_BUFFER + if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, + RECOMPILER_GUARD_SIZE)) +#else + if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) +#endif + { + Panic("Failed to initialize code space"); + } + + if (g_settings.IsUsingFastmem() && !InitializeFastmem()) + Panic("Failed to initialize fastmem"); + + ResetFastMap(); + CompileDispatcher(); + } #endif } @@ -298,8 +332,10 @@ void Flush() for (const auto& it : s_blocks) delete it.second; + s_blocks.clear(); #ifdef WITH_RECOMPILER + s_host_code_map.clear(); s_code_buffer.Reset(); ResetFastMap(); CompileDispatcher(); @@ -358,6 +394,8 @@ CodeBlock* LookupBlock(CodeBlockKey key) } iter = s_blocks.emplace(key.bits, block).first; + AddBlockToHostCodeMap(block); + return block; } @@ -384,6 +422,8 @@ bool RevalidateBlock(CodeBlock* block) return true; recompile: + RemoveBlockFromHostCodeMap(block); + 
block->instructions.clear(); if (!CompileBlock(block)) { @@ -393,6 +433,7 @@ recompile: } // re-add to page map again + AddBlockToHostCodeMap(block); if (block->IsInRAM()) AddBlockToPageMap(block); @@ -439,6 +480,9 @@ bool CompileBlock(CodeBlock* block) block->uncached_fetch_ticks += GetInstructionReadTicks(pc); } + block->contains_loadstore_instructions |= cbi.is_load_instruction; + block->contains_loadstore_instructions |= cbi.is_store_instruction; + // instruction is decoded now block->instructions.push_back(cbi); pc += sizeof(cbi.instruction.bits); @@ -481,7 +525,7 @@ bool CompileBlock(CodeBlock* block) } #ifdef WITH_RECOMPILER - if (s_use_recompiler) + if (g_settings.IsUsingRecompiler()) { // Ensure we're not going to run out of space while compiling this block. if (s_code_buffer.GetFreeCodeSpace() < @@ -552,6 +596,9 @@ void FlushBlock(CodeBlock* block) RemoveBlockFromPageMap(block); UnlinkBlock(block); +#ifdef WITH_RECOMPILER + RemoveBlockFromHostCodeMap(block); +#endif s_blocks.erase(iter); delete block; @@ -613,4 +660,107 @@ void UnlinkBlock(CodeBlock* block) block->link_successors.clear(); } +#ifdef WITH_RECOMPILER + +void AddBlockToHostCodeMap(CodeBlock* block) +{ + if (!g_settings.IsUsingRecompiler()) + return; + + auto ir = s_host_code_map.emplace(block->host_code, block); + Assert(ir.second); +} + +void RemoveBlockFromHostCodeMap(CodeBlock* block) +{ + if (!g_settings.IsUsingRecompiler()) + return; + + HostCodeMap::iterator hc_iter = s_host_code_map.find(block->host_code); + Assert(hc_iter != s_host_code_map.end()); + s_host_code_map.erase(hc_iter); +} + +bool InitializeFastmem() +{ + if (!Common::PageFaultHandler::InstallHandler(&s_host_code_map, PageFaultHandler)) + { + Log_ErrorPrintf("Failed to install page fault handler"); + return false; + } + + Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc); + return true; +} + +void ShutdownFastmem() +{ + Common::PageFaultHandler::RemoveHandler(&s_host_code_map); + Bus::UpdateFastmemViews(false, 
false); +} + +Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write) +{ + if (static_cast(fault_address) < g_state.fastmem_base || + (static_cast(fault_address) - g_state.fastmem_base) >= Bus::FASTMEM_REGION_SIZE) + { + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + + const PhysicalMemoryAddress fastmem_address = + static_cast(static_cast(static_cast(fault_address) - g_state.fastmem_base)); + + Log_DevPrintf("Page fault handler invoked at PC=%p Address=%p %s, fastmem offset 0x%08X", exception_pc, fault_address, + is_write ? "(write)" : "(read)", fastmem_address); + + if (is_write && !g_state.cop0_regs.sr.Isc && Bus::IsRAMAddress(fastmem_address)) + { + // this is probably a code page, since we aren't going to fault due to requiring fastmem on RAM. + const u32 code_page_index = Bus::GetRAMCodePageIndex(fastmem_address); + if (Bus::IsRAMCodePage(code_page_index)) + { + InvalidateBlocksWithPageIndex(code_page_index); + return Common::PageFaultHandler::HandlerResult::ContinueExecution; + } + } + + // use upper_bound to find the next block after the pc + HostCodeMap::iterator upper_iter = + s_host_code_map.upper_bound(reinterpret_cast(exception_pc)); + if (upper_iter == s_host_code_map.begin()) + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + + // then decrement it by one to (hopefully) get the block we want + upper_iter--; + + // find the loadstore info in the code block + CodeBlock* block = upper_iter->second; + for (auto bpi_iter = block->loadstore_backpatch_info.begin(); bpi_iter != block->loadstore_backpatch_info.end(); + ++bpi_iter) + { + const Recompiler::LoadStoreBackpatchInfo& lbi = *bpi_iter; + if (lbi.host_pc == exception_pc) + { + // found it, do fixup + if (Recompiler::CodeGenerator::BackpatchLoadStore(lbi)) + { + // remove the backpatch entry since we won't be coming back to this one + block->loadstore_backpatch_info.erase(bpi_iter); + return 
Common::PageFaultHandler::HandlerResult::ContinueExecution; + } + else + { + Log_ErrorPrintf("Failed to backpatch %p in block 0x%08X", exception_pc, block->GetPC()); + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; + } + } + } + + // we didn't find the pc in our list.. + Log_ErrorPrintf("Loadstore PC not found for %p in block 0x%08X", exception_pc, block->GetPC()); + return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler; +} + +#endif + } // namespace CPU::CodeCache diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 5d285191b..92a8b0d0b 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -2,12 +2,18 @@ #include "bus.h" #include "common/bitfield.h" #include "common/jit_code_buffer.h" +#include "common/page_fault_handler.h" #include "cpu_types.h" #include +#include #include #include #include +#ifdef WITH_RECOMPILER +#include "cpu_recompiler_types.h" +#endif + namespace CPU { enum : u32 @@ -71,6 +77,12 @@ struct CodeBlock TickCount uncached_fetch_ticks = 0; u32 icache_line_count = 0; + +#ifdef WITH_RECOMPILER + std::vector loadstore_backpatch_info; +#endif + + bool contains_loadstore_instructions = false; bool invalidated = false; const u32 GetPC() const { return key.GetPC(); } @@ -89,7 +101,7 @@ struct CodeBlock namespace CodeCache { -void Initialize(bool use_recompiler); +void Initialize(); void Shutdown(); void Execute(); @@ -102,7 +114,7 @@ void ExecuteRecompiler(); void Flush(); -/// Changes whether the recompiler is enabled. -void SetUseRecompiler(bool enable); +/// Flushes all blocks and rebuilds the code buffer/fastmem state from the current settings. +void Reinitialize(); /// Invalidates all blocks which are in the range of the specified code page. 
void InvalidateBlocksWithPageIndex(u32 page_index); diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 0d88e465b..a26e8cc60 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -1,4 +1,5 @@ #include "cpu_core.h" +#include "bus.h" #include "common/align.h" #include "common/file_system.h" #include "common/log.h" @@ -1563,6 +1564,11 @@ bool InterpretInstructionPGXP() return g_state.exception_raised; } +void UpdateFastmemMapping() +{ + Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc); +} + } // namespace Recompiler::Thunks } // namespace CPU \ No newline at end of file diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h index 43c14c99a..58d396d30 100644 --- a/src/core/cpu_core.h +++ b/src/core/cpu_core.h @@ -79,6 +79,8 @@ struct State // GTE registers are stored here so we can access them on ARM with a single instruction GTE::Regs gte_regs = {}; + u8* fastmem_base = nullptr; + // data cache (used as scratchpad) std::array dcache = {}; std::array icache_tags = {}; diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index b3f1bf4cb..084e2f001 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -19,8 +19,7 @@ u32 CodeGenerator::CalculateRegisterOffset(Reg reg) return u32(offsetof(State, regs.r[0]) + (static_cast(reg) * sizeof(u32))); } -bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, - u32* out_host_code_size) +bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size) { // TODO: Align code buffer. 
@@ -40,8 +39,10 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray()); #endif + m_current_instruction = cbi; if (!CompileInstruction(*cbi)) { + m_current_instruction = nullptr; m_block_end = nullptr; m_block_start = nullptr; m_block = nullptr; @@ -60,6 +61,7 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin DebugAssert(m_register_cache.GetUsedHostRegisters() == 0); + m_current_instruction = nullptr; m_block_end = nullptr; m_block_start = nullptr; m_block = nullptr; @@ -1895,7 +1897,22 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi) value = AndValues(value, Value::FromConstantU32(write_mask)); } - EmitStoreCPUStructField(offset, value); + // changing SR[Isc] needs to update fastmem views + if (reg == Cop0Reg::SR && g_settings.cpu_fastmem) + { + LabelType skip_fastmem_update; + Value old_value = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(old_value.host_reg, RegSize_32, offset); + EmitStoreCPUStructField(offset, value); + EmitXor(old_value.host_reg, old_value.host_reg, value); + EmitBranchIfBitClear(old_value.host_reg, RegSize_32, 16, &skip_fastmem_update); + EmitFunctionCall(nullptr, &Thunks::UpdateFastmemMapping, m_register_cache.GetCPUPtr()); + EmitBindLabel(&skip_fastmem_update); + } + else + { + EmitStoreCPUStructField(offset, value); + } } } diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 699c50d99..a5138a7dd 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -23,7 +23,9 @@ public: static const char* GetHostRegName(HostReg reg, RegSize size = HostPointerSize); static void AlignCodeBuffer(JitCodeBuffer* code_buffer); - bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + static bool BackpatchLoadStore(const 
LoadStoreBackpatchInfo& lbi); + + bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); CodeBlock::HostCodePointer CompileDispatcher(); @@ -73,7 +75,11 @@ public: // Automatically generates an exception handler. Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size); + void EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result); + void EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result, bool in_far_code); void EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const Value& value); + void EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value); + void EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value, bool in_far_code); // Unconditional branch to pointer. May allocate a scratch register. 
void EmitBranch(const void* address, bool allow_scratch = true); @@ -204,9 +210,10 @@ private: bool Compile_cop2(const CodeBlockInstruction& cbi); JitCodeBuffer* m_code_buffer; - const CodeBlock* m_block = nullptr; + CodeBlock* m_block = nullptr; const CodeBlockInstruction* m_block_start = nullptr; const CodeBlockInstruction* m_block_end = nullptr; + const CodeBlockInstruction* m_current_instruction = nullptr; RegisterCache m_register_cache; CodeEmitter m_near_emitter; CodeEmitter m_far_emitter; diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 3225be0ab..af1db51cb 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -14,6 +14,7 @@ namespace a64 = vixl::aarch64; namespace CPU::Recompiler { constexpr HostReg RCPUPTR = 19; +constexpr HostReg RMEMBASEPTR = 20; constexpr HostReg RRETURN = 0; constexpr HostReg RARG1 = 0; constexpr HostReg RARG2 = 1; @@ -86,6 +87,11 @@ static const a64::XRegister GetCPUPtrReg() return GetHostReg64(RCPUPTR); } +static const a64::XRegister GetFastmemBasePtrReg() +{ + return GetHostReg64(RMEMBASEPTR); +} + CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer) : m_code_buffer(code_buffer), m_register_cache(*this), m_near_emitter(static_cast(code_buffer->GetFreeCodePointer()), code_buffer->GetFreeCodeSpace(), @@ -188,10 +194,21 @@ void CodeGenerator::EmitBeginBlock() // Store the CPU struct pointer. TODO: make this better. const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); + + // If there's loadstore instructions, preload the fastmem base. 
+ if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); + } } void CodeGenerator::EmitEndBlock() { + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); + m_register_cache.FreeHostReg(RCPUPTR); m_register_cache.PopCalleeSavedRegisters(true); @@ -1308,12 +1325,105 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + Value result = m_register_cache.AllocateScratch(RegSize_64); + if (g_settings.IsUsingFastmem()) + { + EmitLoadGuestMemoryFastmem(cbi, address, size, result); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); + } + + // Downcast to ignore upper 56/48/32 bits. This should be a noop. 
+ switch (size) + { + case RegSize_8: + ConvertValueSizeInPlace(&result, RegSize_8, false); + break; + + case RegSize_16: + ConvertValueSizeInPlace(&result, RegSize_16, false); + break; + + case RegSize_32: + ConvertValueSizeInPlace(&result, RegSize_32, false); + break; + + default: + UnreachableCode(); + break; + } + + return result; +} + +void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = result.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + a64::MemOperand actual_address; + if (address.IsConstant()) + { + m_emit->Mov(GetHostReg32(result.host_reg), address.constant_value); + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(result.host_reg)); + bpi.host_pc = GetCurrentNearCodePointer(); + } + else + { + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address)); + } + + // TODO: movsx/zx inline here + switch (size) + { + case RegSize_8: + m_emit->Ldrb(GetHostReg32(result.host_reg), actual_address); + break; + + case RegSize_16: + m_emit->Ldrh(GetHostReg32(result.host_reg), actual_address); + break; + + case RegSize_32: + m_emit->Ldr(GetHostReg32(result.host_reg), actual_address); + break; + + default: + UnreachableCode(); + break; + } + + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + + // return to the block code + EmitBranch(GetCurrentNearCodePointer(), false); + + SwitchToNearCode(); + + 
m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - // We need to use the full 64 bits here since we test the sign bit result. - Value result = m_register_cache.AllocateScratch(RegSize_64); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - // NOTE: This can leave junk in the upper bits switch (size) { @@ -1342,7 +1452,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const m_emit->Bind(&load_okay); // load exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (-result << 2) | BD | cop_n m_emit->neg(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg)); @@ -1353,37 +1464,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - ConvertValueSizeInPlace(&result, RegSize_32, false); - break; - - default: - UnreachableCode(); - break; - } - - return result; } else { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (size) { case RegSize_8: @@ -1402,27 +1490,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const UnreachableCode(); break; } - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. 
- switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - break; - - default: - UnreachableCode(); - break; - } - - return result; } } @@ -1443,11 +1510,87 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + if (g_settings.IsUsingFastmem()) + { + // we need the value in a host register to store it + Value value_in_hr = GetValueInHostRegister(value); + EmitStoreGuestMemoryFastmem(cbi, address, value_in_hr); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitStoreGuestMemorySlowmem(cbi, address, value, false); + } +} + +void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = value.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + a64::MemOperand actual_address; + if (address.IsConstant()) + { + m_emit->Mov(GetHostReg32(RSCRATCH), address.constant_value); + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RSCRATCH)); + bpi.host_pc = GetCurrentNearCodePointer(); + } + else + { + actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address)); + } + + switch (value.size) + { + case RegSize_8: + m_emit->Strb(GetHostReg8(value), actual_address); + break; + + case RegSize_16: + m_emit->Strh(GetHostReg16(value), actual_address); + break; + + case RegSize_32: + m_emit->Str(GetHostReg32(value), actual_address); + break; + + default: + UnreachableCode(); + break; + } + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + 
SwitchToFarCode(); + + EmitStoreGuestMemorySlowmem(cbi, address, value, true); + + // return to the block code + EmitBranch(GetCurrentNearCodePointer(), false); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value, bool in_far_code) +{ + AddPendingCycles(true); + if (g_settings.cpu_recompiler_memory_exceptions) { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); + Assert(!in_far_code); + Value result = m_register_cache.AllocateScratch(RegSize_32); switch (value.size) { case RegSize_8: @@ -1475,7 +1618,8 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const m_emit->Bind(&store_okay); // store exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (result << 2) | BD | cop_n m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2); @@ -1484,15 +1628,14 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n))); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); - EmitExceptionExit(); + if (!in_far_code) + EmitExceptionExit(); SwitchToNearCode(); m_register_cache.PopState(); } else { - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (value.size) { case RegSize_8: @@ -1514,6 +1657,30 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } } +bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +{ + Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc); + + // check jump distance + const s64 jump_distance = + static_cast(reinterpret_cast(lbi.host_slowmem_pc) - 
reinterpret_cast(lbi.host_pc)); + Assert(Common::IsAligned(jump_distance, 4)); + Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2)); + + // turn it into a jump to the slowmem handler + vixl::aarch64::MacroAssembler emit(static_cast(lbi.host_pc), lbi.host_code_size, + a64::PositionDependentCode); + emit.b(jump_distance >> 2); + + const s32 nops = (static_cast(lbi.host_code_size) - static_cast(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); + return true; +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { EmitLoadGlobalAddress(RSCRATCH, ptr); diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 5770b4f4f..cb42a4e30 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,4 +1,5 @@ #include "common/align.h" +#include "common/assert.h" #include "common/log.h" #include "cpu_core.h" #include "cpu_core_private.h" @@ -12,6 +13,7 @@ namespace CPU::Recompiler { #if defined(ABI_WIN64) constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; +constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; constexpr HostReg RRETURN = Xbyak::Operand::RAX; constexpr HostReg RARG1 = Xbyak::Operand::RCX; constexpr HostReg RARG2 = Xbyak::Operand::RDX; @@ -21,6 +23,7 @@ constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32; constexpr u64 FUNCTION_CALL_STACK_ALIGNMENT = 16; #elif defined(ABI_SYSV) constexpr HostReg RCPUPTR = Xbyak::Operand::RBP; +constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX; constexpr HostReg RRETURN = Xbyak::Operand::RAX; constexpr HostReg RARG1 = Xbyak::Operand::RDI; constexpr HostReg RARG2 = Xbyak::Operand::RSI; @@ -79,6 +82,11 @@ static const Xbyak::Reg64 GetCPUPtrReg() return GetHostReg64(RCPUPTR); } +static const Xbyak::Reg64 GetFastmemBasePtrReg() +{ + return 
GetHostReg64(RMEMBASEPTR); +} + CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer) : m_code_buffer(code_buffer), m_register_cache(*this), m_near_emitter(code_buffer->GetFreeCodeSpace(), code_buffer->GetFreeCodePointer()), @@ -140,7 +148,6 @@ void CodeGenerator::InitHostRegs() m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI, Xbyak::Operand::RSP, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}); - m_register_cache.SetCPUPtrHostReg(RCPUPTR); #elif defined(ABI_SYSV) m_register_cache.SetHostRegAllocationOrder( {Xbyak::Operand::RBX, /*Xbyak::Operand::RSP, */ Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, @@ -154,8 +161,9 @@ void CodeGenerator::InitHostRegs() m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RSP, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15}); - m_register_cache.SetCPUPtrHostReg(RCPUPTR); #endif + + m_register_cache.SetCPUPtrHostReg(RCPUPTR); } void CodeGenerator::SwitchToFarCode() @@ -196,11 +204,22 @@ void CodeGenerator::EmitBeginBlock() const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + + // If there's loadstore instructions, preload the fastmem base. 
+ if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]); + } } void CodeGenerator::EmitEndBlock() { m_register_cache.FreeHostReg(RCPUPTR); + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); + m_register_cache.PopCalleeSavedRegisters(true); m_emit->ret(); @@ -1762,12 +1781,139 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + Value result = m_register_cache.AllocateScratch(RegSize_64); + if (g_settings.IsUsingFastmem()) + { + EmitLoadGuestMemoryFastmem(cbi, address, size, result); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); + } + + // Downcast to ignore upper 56/48/32 bits. This should be a noop. 
+ switch (size) + { + case RegSize_8: + ConvertValueSizeInPlace(&result, RegSize_8, false); + break; + + case RegSize_16: + ConvertValueSizeInPlace(&result, RegSize_16, false); + break; + + case RegSize_32: + ConvertValueSizeInPlace(&result, RegSize_32, false); + break; + + default: + UnreachableCode(); + break; + } + + return result; +} + +void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = result.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + // can't store displacements > 0x80000000 in-line + const Value* actual_address = &address; + if (address.IsConstant() && address.constant_value >= 0x80000000) + { + actual_address = &result; + m_emit->mov(GetHostReg32(result.host_reg), address.constant_value); + bpi.host_pc = GetCurrentNearCodePointer(); + } + + // TODO: movsx/zx inline here + switch (size) + { + case RegSize_8: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg8(result.host_reg), + m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg8(result.host_reg), + m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + + case RegSize_16: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg16(result.host_reg), + m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg16(result.host_reg), + m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + + case RegSize_32: + { + if (actual_address->IsConstant()) + { + m_emit->mov(GetHostReg32(result.host_reg), + m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value]); + } + else + { + m_emit->mov(GetHostReg32(result.host_reg), + 
m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]); + } + } + break; + } + + // TODO: BIOS reads... + EmitAddCPUStructField(offsetof(CPU::State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); + + // insert nops, we need at least 5 bytes for a relative jump + const u32 fastmem_size = + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0); + for (u32 i = 0; i < nops; i++) + m_emit->nop(); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + + // return to the block code + m_emit->jmp(GetCurrentNearCodePointer()); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, + Value& result, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - // We need to use the full 64 bits here since we test the sign bit result. 
- Value result = m_register_cache.AllocateScratch(RegSize_64); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - // NOTE: This can leave junk in the upper bits switch (size) { @@ -1794,7 +1940,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const m_register_cache.PushState(); // load exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (-result << 2) | BD | cop_n m_emit->neg(GetHostReg32(result.host_reg)); @@ -1805,37 +1952,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. - switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - ConvertValueSizeInPlace(&result, RegSize_32, false); - break; - - default: - UnreachableCode(); - break; - } - - return result; } else { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (size) { case RegSize_8: @@ -1854,27 +1978,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const UnreachableCode(); break; } - - // Downcast to ignore upper 56/48/32 bits. This should be a noop. 
- switch (size) - { - case RegSize_8: - ConvertValueSizeInPlace(&result, RegSize_8, false); - break; - - case RegSize_16: - ConvertValueSizeInPlace(&result, RegSize_16, false); - break; - - case RegSize_32: - break; - - default: - UnreachableCode(); - break; - } - - return result; } } @@ -1895,11 +1998,163 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const AddPendingCycles(true); + if (g_settings.IsUsingFastmem()) + { + EmitStoreGuestMemoryFastmem(cbi, address, value); + } + else + { + m_register_cache.FlushCallerSavedGuestRegisters(true, true); + EmitStoreGuestMemorySlowmem(cbi, address, value, false); + } +} + +void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value) +{ + // fastmem + LoadStoreBackpatchInfo bpi; + bpi.host_pc = GetCurrentNearCodePointer(); + bpi.address_host_reg = HostReg_Invalid; + bpi.value_host_reg = value.host_reg; + bpi.guest_pc = m_current_instruction->pc; + + // can't store displacements > 0x80000000 in-line + const Value* actual_address = &address; + Value temp_address; + if (address.IsConstant() && address.constant_value >= 0x80000000) + { + temp_address.SetHostReg(&m_register_cache, RRETURN, RegSize_32); + actual_address = &temp_address; + m_emit->mov(GetHostReg32(temp_address), address.constant_value); + bpi.host_pc = GetCurrentNearCodePointer(); + } + + switch (value.size) + { + case RegSize_8: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg8(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + 
GetHostReg64(actual_address->host_reg)], + GetHostReg8(value.host_reg)); + } + } + } + break; + + case RegSize_16: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg16(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + GetHostReg16(value.host_reg)); + } + } + } + break; + + case RegSize_32: + { + if (actual_address->IsConstant()) + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value); + } + else + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value], + GetHostReg32(value.host_reg)); + } + } + else + { + if (value.IsConstant()) + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + value.constant_value); + } + else + { + m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)], + GetHostReg32(value.host_reg)); + } + } + } + break; + } + + // insert nops, we need at least 5 bytes for a relative jump + const u32 fastmem_size = + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); + const u32 nops = (fastmem_size < 5 ? 
5 - fastmem_size : 0); + for (u32 i = 0; i < nops; i++) + m_emit->nop(); + + bpi.host_code_size = static_cast( + static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); + + // generate slowmem fallback + bpi.host_slowmem_pc = GetCurrentFarCodePointer(); + SwitchToFarCode(); + + EmitStoreGuestMemorySlowmem(cbi, address, value, true); + + // return to the block code + m_emit->jmp(GetCurrentNearCodePointer()); + + SwitchToNearCode(); + + m_block->loadstore_backpatch_info.push_back(bpi); +} + +void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, + const Value& value, bool in_far_code) +{ if (g_settings.cpu_recompiler_memory_exceptions) { - Value result = m_register_cache.AllocateScratch(RegSize_32); - m_register_cache.FlushCallerSavedGuestRegisters(true, true); + Assert(!in_far_code); + Value result = m_register_cache.AllocateScratch(RegSize_32); switch (value.size) { case RegSize_8: @@ -1925,24 +2180,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const m_emit->jnz(GetCurrentFarCodePointer()); // store exception path - SwitchToFarCode(); + if (!in_far_code) + SwitchToFarCode(); // cause_bits = (result << 2) | BD | cop_n - m_emit->shl(GetHostReg32(result.host_reg), 2); - m_emit->or_(GetHostReg32(result.host_reg), + m_emit->shl(GetHostReg32(result), 2); + m_emit->or_(GetHostReg32(result), Cop0Registers::CAUSE::MakeValueForException(static_cast(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)); EmitFunctionCall(nullptr, static_cast(&CPU::RaiseException), result, GetCurrentInstructionPC()); EmitExceptionExit(); - SwitchToNearCode(); + if (!in_far_code) + SwitchToNearCode(); m_register_cache.PopState(); } else { - m_register_cache.FlushCallerSavedGuestRegisters(true, true); - switch (value.size) { case RegSize_8: @@ -1964,6 +2219,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } } +bool 
CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) +{ + Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc); + + // turn it into a jump to the slowmem handler + Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc); + cg.jmp(lbi.host_slowmem_pc); + + const s32 nops = static_cast(lbi.host_code_size) - + static_cast(static_cast(cg.getCurr() - static_cast(lbi.host_pc))); + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + cg.nop(); + + JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size); + return true; +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { const s64 displacement = diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h index f698a859d..b9f5ced77 100644 --- a/src/core/cpu_recompiler_thunks.h +++ b/src/core/cpu_recompiler_thunks.h @@ -32,6 +32,7 @@ void UncheckedWriteMemoryByte(u32 address, u8 value); void UncheckedWriteMemoryHalfWord(u32 address, u16 value); void UncheckedWriteMemoryWord(u32 address, u32 value); +void UpdateFastmemMapping(); } // namespace Recompiler::Thunks diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index 9bb224223..3a8f1bc3c 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -127,6 +127,16 @@ constexpr bool SHIFTS_ARE_IMPLICITLY_MASKED = false; #endif +struct LoadStoreBackpatchInfo +{ + void* host_pc; // pointer to instruction which will fault + void* host_slowmem_pc; // pointer to slowmem callback code + u32 host_code_size; // size of the fastmem load as well as the add for cycles + HostReg address_host_reg; // register containing the guest address to load/store + HostReg value_host_reg; // register containing the source/destination + PhysicalMemoryAddress guest_pc; +}; + } // namespace Recompiler } // namespace CPU diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp index 600b94bd5..7dc243181 100644 --- 
a/src/core/host_interface.cpp +++ b/src/core/host_interface.cpp @@ -366,6 +366,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si) si.SetStringValue("CPU", "ExecutionMode", Settings::GetCPUExecutionModeName(Settings::DEFAULT_CPU_EXECUTION_MODE)); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", false); si.SetBoolValue("CPU", "ICache", false); + si.SetBoolValue("CPU", "Fastmem", true); si.SetStringValue("GPU", "Renderer", Settings::GetRendererName(Settings::DEFAULT_GPU_RENDERER)); si.SetIntValue("GPU", "ResolutionScale", 1); @@ -512,12 +513,13 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings) if (g_settings.emulation_speed != old_settings.emulation_speed) System::UpdateThrottlePeriod(); - if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode) + if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode || + g_settings.cpu_fastmem != old_settings.cpu_fastmem) { - AddFormattedOSDMessage(5.0f, "Switching to %s CPU execution mode.", - Settings::GetCPUExecutionModeName(g_settings.cpu_execution_mode)); - CPU::CodeCache::SetUseRecompiler(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); - CPU::CodeCache::Flush(); + AddFormattedOSDMessage(5.0f, "Switching to %s CPU execution mode%s.", + Settings::GetCPUExecutionModeName(g_settings.cpu_execution_mode), + g_settings.cpu_fastmem ? 
" (fastmem)" : ""); + CPU::CodeCache::Reinitialize(); CPU::ClearICache(); } diff --git a/src/core/settings.cpp b/src/core/settings.cpp index d62439e34..135e8a522 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -96,6 +96,7 @@ void Settings::Load(SettingsInterface& si) .value_or(DEFAULT_CPU_EXECUTION_MODE); cpu_recompiler_memory_exceptions = si.GetBoolValue("CPU", "RecompilerMemoryExceptions", false); cpu_recompiler_icache = si.GetBoolValue("CPU", "RecompilerICache", false); + cpu_fastmem = si.GetBoolValue("CPU", "Fastmem", true); gpu_renderer = ParseRendererName(si.GetStringValue("GPU", "Renderer", GetRendererName(DEFAULT_GPU_RENDERER)).c_str()) .value_or(DEFAULT_GPU_RENDERER); @@ -217,6 +218,7 @@ void Settings::Save(SettingsInterface& si) const si.SetStringValue("CPU", "ExecutionMode", GetCPUExecutionModeName(cpu_execution_mode)); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", cpu_recompiler_memory_exceptions); si.SetBoolValue("CPU", "RecompilerICache", cpu_recompiler_icache); + si.SetBoolValue("CPU", "Fastmem", cpu_fastmem); si.SetStringValue("GPU", "Renderer", GetRendererName(gpu_renderer)); si.SetStringValue("GPU", "Adapter", gpu_adapter.c_str()); diff --git a/src/core/settings.h b/src/core/settings.h index 08373fd1c..b1dfa5978 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -72,6 +72,7 @@ struct Settings CPUExecutionMode cpu_execution_mode = CPUExecutionMode::Interpreter; bool cpu_recompiler_memory_exceptions = false; bool cpu_recompiler_icache = false; + bool cpu_fastmem = true; float emulation_speed = 1.0f; bool speed_limiter_enabled = true; @@ -172,6 +173,11 @@ struct Settings return gpu_pgxp_enable ? (gpu_pgxp_cpu ? 
PGXPMode::CPU : PGXPMode::Memory) : PGXPMode::Disabled; } + ALWAYS_INLINE bool IsUsingFastmem() const + { + return (cpu_fastmem && cpu_execution_mode == CPUExecutionMode::Recompiler && !cpu_recompiler_memory_exceptions); + } + bool HasAnyPerGameMemoryCards() const; enum : u32 diff --git a/src/core/system.cpp b/src/core/system.cpp index b861fada4..fd2965da6 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -708,14 +708,16 @@ bool Initialize(bool force_software_renderer) TimingEvents::Initialize(); CPU::Initialize(); - CPU::CodeCache::Initialize(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); - Bus::Initialize(); + + if (!Bus::Initialize()) + return false; + + CPU::CodeCache::Initialize(); if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer)) return false; g_dma.Initialize(); - g_interrupt_controller.Initialize(); g_cdrom.Initialize(); diff --git a/src/core/types.h b/src/core/types.h index c1a281064..8f896dad2 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -129,6 +129,6 @@ enum : u32 enum : u32 { - CPU_CODE_CACHE_PAGE_SIZE = 1024, + CPU_CODE_CACHE_PAGE_SIZE = 4096, CPU_CODE_CACHE_PAGE_COUNT = 0x200000 / CPU_CODE_CACHE_PAGE_SIZE }; From dc2840daa263ab168b952d051df905d83017a3ef Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 6 Sep 2020 22:55:00 +1000 Subject: [PATCH 8/8] Support fastmem on scratchpad Disabled due to incorrect access time --- src/core/bus.cpp | 71 +++++++++++++++++++++++++++++++++---------- src/core/bus.h | 12 ++++++-- src/core/cpu_core.cpp | 1 - src/core/cpu_core.h | 2 -- 4 files changed, 64 insertions(+), 22 deletions(-) diff --git a/src/core/bus.cpp b/src/core/bus.cpp index 340bca2c5..6e50e5055 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -70,8 +70,9 @@ union MEMCTRL }; std::bitset m_ram_code_bits{}; -u8* g_ram = nullptr; // 2MB RAM +u8* g_ram = nullptr; // 2MB RAM u8* g_bios = nullptr; // 512K BIOS ROM +u8* g_scratchpad = nullptr; static std::array 
m_exp1_access_time = {}; static std::array m_exp2_access_time = {}; @@ -89,11 +90,15 @@ static std::string m_tty_line_buffer; static Common::MemoryArena m_memory_arena; static u8* m_fastmem_base = nullptr; static std::vector m_fastmem_ram_views; +static std::vector m_fastmem_scratchpad_views; +static std::vector m_fastmem_bios_views; static std::tuple CalculateMemoryTiming(MEMDELAY mem_delay, COMDELAY common_delay); static void RecalculateMemoryTimings(); static void SetCodePageFastmemProtection(u32 page_index, bool writable); +static bool AllocateMemory(); +static void UnmapFastmemViews(); #define FIXUP_WORD_READ_OFFSET(offset) ((offset) & ~u32(3)) #define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset)&u32(3)) * 8u)) @@ -124,11 +129,13 @@ bool Initialize() void Shutdown() { - m_fastmem_ram_views.clear(); + UnmapFastmemViews(); if (g_ram) m_memory_arena.ReleaseViewPtr(g_ram, RAM_SIZE); if (g_bios) m_memory_arena.ReleaseViewPtr(g_bios, BIOS_SIZE); + if (g_scratchpad) + m_memory_arena.ReleaseViewPtr(g_scratchpad, FASTMEM_SCRATCHPAD_SIZE); CPU::g_state.fastmem_base = nullptr; } @@ -136,6 +143,7 @@ void Shutdown() void Reset() { std::memset(g_ram, 0, RAM_SIZE); + std::memset(g_scratchpad, 0, SCRATCHPAD_SIZE); m_MEMCTRL.exp1_base = 0x1F000000; m_MEMCTRL.exp2_base = 0x1F802000; m_MEMCTRL.exp1_delay_size.bits = 0x0013243F; @@ -159,6 +167,7 @@ bool DoState(StateWrapper& sw) sw.Do(&m_spu_access_time); sw.DoBytes(g_ram, RAM_SIZE); sw.DoBytes(g_bios, BIOS_SIZE); + sw.DoBytes(g_scratchpad, SCRATCHPAD_SIZE); sw.DoArray(m_MEMCTRL.regs, countof(m_MEMCTRL.regs)); sw.Do(&m_ram_size_reg); sw.Do(&m_tty_line_buffer); @@ -248,6 +257,8 @@ bool AllocateMemory() // Create the base views. 
g_ram = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false)); g_bios = static_cast(m_memory_arena.CreateViewPtr(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, true, false)); + g_scratchpad = static_cast( + m_memory_arena.CreateViewPtr(MEMORY_ARENA_SCRATCHPAD_OFFSET, FASTMEM_SCRATCHPAD_SIZE, true, false)); if (!g_ram || !g_bios) { Log_ErrorPrint("Failed to create base views of memory"); @@ -257,9 +268,16 @@ bool AllocateMemory() return true; } -void UpdateFastmemViews(bool enabled, bool isolate_cache) +void UnmapFastmemViews() { m_fastmem_ram_views.clear(); + m_fastmem_scratchpad_views.clear(); + m_fastmem_bios_views.clear(); +} + +void UpdateFastmemViews(bool enabled, bool isolate_cache) +{ + UnmapFastmemViews(); if (!enabled) { m_fastmem_base = nullptr; @@ -296,15 +314,33 @@ void UpdateFastmemViews(bool enabled, bool isolate_cache) { u8* page_address = map_address + (i * CPU_CODE_CACHE_PAGE_SIZE); if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, false, false)) - { Log_ErrorPrintf("Failed to write-protect code page at %p"); - return; - } } } m_fastmem_ram_views.push_back(std::move(view.value())); }; + auto MapScratchpad = [](u32 base_address) { + u8* map_address = m_fastmem_base + base_address; + auto view = + m_memory_arena.CreateView(MEMORY_ARENA_SCRATCHPAD_OFFSET, FASTMEM_SCRATCHPAD_SIZE, true, false, map_address); + if (!view) + { + Log_ErrorPrintf("Failed to map scratchpad at fastmem area %p (offset 0x%08X)", map_address, + FASTMEM_SCRATCHPAD_SIZE); + return; + } + + // mark all pages beyond the first as inaccessible + // we need to do this because of windows's stupidity with its 64K mapping granularity + if (!m_memory_arena.SetPageProtection(map_address + CPU_CODE_CACHE_PAGE_SIZE, + FASTMEM_SCRATCHPAD_SIZE - CPU_CODE_CACHE_PAGE_SIZE, false, false, false)) + { + Log_ErrorPrintf("Failed to read/write protect scratchpad"); + } + + m_fastmem_scratchpad_views.push_back(std::move(view.value())); + }; 
auto MapBIOS = [](u32 base_address) { u8* map_address = m_fastmem_base + base_address; auto view = m_memory_arena.CreateView(MEMORY_ARENA_BIOS_OFFSET, BIOS_SIZE, false, false, map_address); @@ -314,17 +350,19 @@ void UpdateFastmemViews(bool enabled, bool isolate_cache) return; } - m_fastmem_ram_views.push_back(std::move(view.value())); + m_fastmem_bios_views.push_back(std::move(view.value())); }; if (!isolate_cache) { // KUSEG - cached MapRAM(0x00000000); + // MapScratchpad(0x1F800000); // MapBIOS(0x1FC00000); // KSEG0 - cached MapRAM(0x80000000); + // MapScratchpad(0x9F800000); // MapBIOS(0x9FC00000); } @@ -411,7 +449,6 @@ bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size) return false; } - static TickCount DoInvalidAccess(MemoryAccessType type, MemoryAccessSize size, PhysicalMemoryAddress address, u32& value) { @@ -1089,34 +1126,36 @@ static void WriteCacheControl(u32 value) template ALWAYS_INLINE static TickCount DoScratchpadAccess(PhysicalMemoryAddress address, u32& value) { + using namespace Bus; + const PhysicalMemoryAddress cache_offset = address & DCACHE_OFFSET_MASK; if constexpr (size == MemoryAccessSize::Byte) { if constexpr (type == MemoryAccessType::Read) - value = ZeroExtend32(g_state.dcache[cache_offset]); + value = ZeroExtend32(g_scratchpad[cache_offset]); else - g_state.dcache[cache_offset] = Truncate8(value); + g_scratchpad[cache_offset] = Truncate8(value); } else if constexpr (size == MemoryAccessSize::HalfWord) { if constexpr (type == MemoryAccessType::Read) { u16 temp; - std::memcpy(&temp, &g_state.dcache[cache_offset], sizeof(temp)); + std::memcpy(&temp, &g_scratchpad[cache_offset], sizeof(temp)); value = ZeroExtend32(temp); } else { u16 temp = Truncate16(value); - std::memcpy(&g_state.dcache[cache_offset], &temp, sizeof(temp)); + std::memcpy(&g_scratchpad[cache_offset], &temp, sizeof(temp)); } } else if constexpr (size == MemoryAccessSize::Word) { if constexpr (type == MemoryAccessType::Read) - std::memcpy(&value, 
&g_state.dcache[cache_offset], sizeof(value)); + std::memcpy(&value, &g_scratchpad[cache_offset], sizeof(value)); else - std::memcpy(&g_state.dcache[cache_offset], &value, sizeof(value)); + std::memcpy(&g_scratchpad[cache_offset], &value, sizeof(value)); } return 0; @@ -1524,7 +1563,7 @@ void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize if (read_ticks) *read_ticks = 0; - return &g_state.dcache[paddr & DCACHE_OFFSET_MASK]; + return &g_scratchpad[paddr & DCACHE_OFFSET_MASK]; } if (paddr >= BIOS_BASE && paddr < (BIOS_BASE + BIOS_SIZE)) @@ -1555,7 +1594,7 @@ void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize #endif if ((paddr & DCACHE_LOCATION_MASK) == DCACHE_LOCATION) - return &g_state.dcache[paddr & DCACHE_OFFSET_MASK]; + return &g_scratchpad[paddr & DCACHE_OFFSET_MASK]; return nullptr; } diff --git a/src/core/bus.h b/src/core/bus.h index ef1905bee..921a68931 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -20,6 +20,9 @@ enum : u32 EXP1_BASE = 0x1F000000, EXP1_SIZE = 0x800000, EXP1_MASK = EXP1_SIZE - 1, + SCRATCHPAD_BASE = 0x1F800000, + SCRATCHPAD_SIZE = 0x400, + SCRATCHPAD_MASK = SCRATCHPAD_SIZE - 1, MEMCTRL_BASE = 0x1F801000, MEMCTRL_SIZE = 0x40, MEMCTRL_MASK = MEMCTRL_SIZE - 1, @@ -73,12 +76,15 @@ enum : TickCount enum : size_t { + FASTMEM_SCRATCHPAD_SIZE = 0x10000, + // Our memory arena contains storage for RAM and BIOS. - MEMORY_ARENA_SIZE = RAM_SIZE + BIOS_SIZE, + MEMORY_ARENA_SIZE = RAM_SIZE + FASTMEM_SCRATCHPAD_SIZE + BIOS_SIZE, // Offsets within the memory arena. MEMORY_ARENA_RAM_OFFSET = 0, - MEMORY_ARENA_BIOS_OFFSET = MEMORY_ARENA_RAM_OFFSET + RAM_SIZE, + MEMORY_ARENA_SCRATCHPAD_OFFSET = MEMORY_ARENA_RAM_OFFSET + RAM_SIZE, + MEMORY_ARENA_BIOS_OFFSET = MEMORY_ARENA_SCRATCHPAD_OFFSET + FASTMEM_SCRATCHPAD_SIZE, // Fastmem region size is 4GB to cover the entire 32-bit address space. 
FASTMEM_REGION_SIZE = UINT64_C(0x100000000) @@ -90,7 +96,6 @@ void Reset(); bool DoState(StateWrapper& sw); u8* GetFastmemBase(); -bool AllocateMemory(); void UpdateFastmemViews(bool enabled, bool isolate_cache); void SetExpansionROM(std::vector data); @@ -99,6 +104,7 @@ void SetBIOS(const std::vector& image); extern std::bitset m_ram_code_bits; extern u8* g_ram; // 2MB RAM extern u8* g_bios; // 512K BIOS ROM +extern u8* g_scratchpad; // 1KB scratchpad as 4K (in fastmem) /// Returns true if the address specified is writable (RAM). ALWAYS_INLINE static bool IsRAMAddress(PhysicalMemoryAddress address) diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index a26e8cc60..6b5ca23fc 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -122,7 +122,6 @@ bool DoState(StateWrapper& sw) sw.Do(&g_state.next_load_delay_reg); sw.Do(&g_state.next_load_delay_value); sw.Do(&g_state.cache_control.bits); - sw.DoBytes(g_state.dcache.data(), g_state.dcache.size()); if (!GTE::DoState(sw)) return false; diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h index 58d396d30..24bb4b442 100644 --- a/src/core/cpu_core.h +++ b/src/core/cpu_core.h @@ -81,8 +81,6 @@ struct State u8* fastmem_base = nullptr; - // data cache (used as scratchpad) - std::array dcache = {}; std::array icache_tags = {}; std::array icache_data = {}; };