diff --git a/src/core/bus.h b/src/core/bus.h index 10c44f90e..d2f187ba6 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -1,6 +1,5 @@ #pragma once #include "common/bitfield.h" -#include "cpu_code_cache.h" #include "types.h" #include #include @@ -97,16 +96,4 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } -/// Invalidates any code pages which overlap the specified range. -ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) -{ - const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; - const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; - for (u32 page = start_page; page <= end_page; page++) - { - if (m_ram_code_bits[page]) - CPU::CodeCache::InvalidateBlocksWithPageIndex(page); - } -} - } // namespace Bus diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index f6bf291f2..e4a1a55e5 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -35,14 +35,8 @@ alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8 static JitCodeBuffer s_code_buffer; -enum : u32 -{ - FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, - FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, - FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, -}; - std::array s_fast_map; +CodeBlock::HostCodePointer s_asm_dispatcher; ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) { @@ -51,6 +45,7 @@ ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) ((pc & Bus::RAM_MASK) >> 2); } +static void CompileDispatcher(); static void FastCompileBlockFunction(); static void ResetFastMap() @@ -111,6 +106,7 @@ void Initialize(bool use_recompiler) } ResetFastMap(); + CompileDispatcher(); #else s_use_recompiler = false; #endif @@ -238,9 +234,21 @@ void Execute() #ifdef WITH_RECOMPILER +void CompileDispatcher() +{ + Recompiler::CodeGenerator cg(&s_code_buffer); + s_asm_dispatcher = cg.CompileDispatcher(); +} + +CodeBlock::HostCodePointer* GetFastMapPointer() +{ + return s_fast_map.data(); +} + void ExecuteRecompiler() { g_state.frame_done = false; +#if 0 while (!g_state.frame_done) { if (HasPendingInterrupt()) @@ -261,6 +269,9 @@ void ExecuteRecompiler() TimingEvents::RunEvents(); } +#else + s_asm_dispatcher(); +#endif // in case we switch to interpreter... g_state.regs.npc = g_state.regs.pc; @@ -291,6 +302,7 @@ void Flush() #ifdef WITH_RECOMPILER s_code_buffer.Reset(); ResetFastMap(); + CompileDispatcher(); #endif } diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 068e6706e..5d285191b 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -1,4 +1,5 @@ #pragma once +#include "bus.h" #include "common/bitfield.h" #include "common/jit_code_buffer.h" #include "cpu_types.h" @@ -9,6 +10,13 @@ namespace CPU { +enum : u32 +{ + FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, + FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, + FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, +}; + union CodeBlockKey { u32 bits; @@ -86,6 +94,7 @@ void Shutdown(); void Execute(); #ifdef WITH_RECOMPILER +CodeBlock::HostCodePointer* GetFastMapPointer(); void ExecuteRecompiler(); #endif @@ -102,6 +111,18 @@ template void InterpretCachedBlock(const CodeBlock& block); void InterpretUncachedBlock(); +/// Invalidates any code pages which overlap the specified range. +ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) +{ + const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; + const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; + for (u32 page = start_page; page <= end_page; page++) + { + if (Bus::m_ram_code_bits[page]) + CPU::CodeCache::InvalidateBlocksWithPageIndex(page); + } +} + }; // namespace CodeCache } // namespace CPU diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 463d64b5a..0d88e465b 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -1381,6 +1381,7 @@ void DispatchInterrupt() { // If the instruction we're about to execute is a GTE instruction, delay dispatching the interrupt until the next // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. + SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction()) GTE::ExecuteInstruction(g_state.next_instruction.bits); diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 438786bd3..699c50d99 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -25,6 +25,8 @@ public: bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + CodeBlock::HostCodePointer CompileDispatcher(); + ////////////////////////////////////////////////////////////////////////// // Code Generation ////////////////////////////////////////////////////////////////////////// @@ -67,6 +69,7 @@ public: void EmitAddCPUStructField(u32 offset, const Value& value); void EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr); void EmitStoreGlobal(void* ptr, const Value& value); + void EmitLoadGlobalAddress(HostReg host_reg, const void* ptr); // Automatically generates an exception handler. Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 7d772c099..c262e35e5 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -6,6 +6,7 @@ #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" Log_SetChannel(CPU::Recompiler); namespace a64 = vixl::aarch64; @@ -26,6 +27,16 @@ constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 constexpr u64 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE; +// PC we return to after the end of the block +static void* s_dispatcher_return_address; + +static s64 GetPCDisplacement(const void* current, const void* target) +{ + Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); + Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); + return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); +} + static const a64::WRegister GetHostReg8(HostReg reg) { return a64::WRegister(reg); @@ -172,11 +183,11 @@ void CodeGenerator::EmitBeginBlock() // Save the link register, since we'll be calling functions. const bool link_reg_allocated = m_register_cache.AllocateHostReg(30); DebugAssert(link_reg_allocated); + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); // Store the CPU struct pointer. TODO: make this better. const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -185,6 +196,7 @@ void CodeGenerator::EmitEndBlock() m_register_cache.PopCalleeSavedRegisters(true); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -200,6 +212,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.PopCalleeSavedRegisters(false); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -958,13 +971,6 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) m_register_cache.PopCallerSavedRegisters(); } -static s64 GetBranchDisplacement(const void* current, const void* target) -{ - Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); - Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); - return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); -} - void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) { if (return_value) @@ -974,7 +980,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) const u32 adjust_size = PrepareStackForCall(); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1009,7 +1015,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG1, arg1); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1045,7 +1051,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG2, arg2); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1083,7 +1089,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG3, arg3); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1122,7 +1128,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG4, arg4); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); const bool use_blr = !vixl::IsInt26(displacement); if (use_blr) { @@ -1510,7 +1516,7 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (size) { case RegSize_8: @@ -1535,7 +1541,7 @@ void CodeGenerator::EmitStoreGlobal(void* ptr, const Value& value) { Value value_in_hr = GetValueInHostRegister(value); - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (value.size) { case RegSize_8: @@ -1882,4 +1888,130 @@ void CodeGenerator::EmitBindLabel(LabelType* label) m_emit->Bind(label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const void* current_code_ptr_page = reinterpret_cast( + reinterpret_cast(GetCurrentCodePointer()) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(ptr) & ~static_cast(0xFFF)); + const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(ptr) & 0xFFFu); + if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) + { + m_emit->adrp(GetHostReg64(host_reg), page_displacement); + m_emit->orr(GetHostReg64(host_reg), GetHostReg64(host_reg), page_offset); + } + else + { + m_emit->Mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); + } +} + +CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() +{ + m_emit->Sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_register_cache.ReserveCallerSavedRegisters(); + + EmitLoadGlobalAddress(RCPUPTR, &g_state); + + a64::Label frame_done_loop; + a64::Label exit_dispatcher; + m_emit->Bind(&frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->ldrb(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, frame_done))); + m_emit->tbnz(a64::w8, 0, &exit_dispatcher); + + // x8 <- sr + a64::Label no_interrupt; + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.sr.bits))); + + // if Iec == 0 then goto no_interrupt + m_emit->tbz(a64::w8, 0, &no_interrupt); + + // x9 <- cause + // x8 (sr) & cause + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.cause.bits))); + m_emit->and_(a64::w8, a64::w8, a64::w9); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->tst(a64::w8, 0xFF00); + m_emit->b(&no_interrupt, a64::eq); + + // we have an interrupt + EmitFunctionCall(nullptr, &DispatchInterrupt); + + // no interrupt or we just serviced it + m_emit->Bind(&no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // x8 <- head event->downcount + // downcount <- x8 + EmitLoadGlobalAddress(8, TimingEvents::GetHeadEventPtr()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x8)); + m_emit->ldr(a64::w8, a64::MemOperand(a64::x8, offsetof(TimingEvent, m_downcount))); + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // main dispatch loop + a64::Label main_loop; + m_emit->Bind(&main_loop); + s_dispatcher_return_address = GetCurrentCodePointer(); + + // w8 <- pending_ticks + // w9 <- downcount + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks))); + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // while downcount < pending_ticks + a64::Label downcount_hit; + m_emit->cmp(a64::w8, a64::w9); + m_emit->b(&downcount_hit, a64::ge); + + // time to lookup the block + // w8 <- pc + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, regs.pc))); + + // w9 <- (pc & RAM_MASK) >> 2 + m_emit->and_(a64::w9, a64::w8, Bus::RAM_MASK); + m_emit->lsr(a64::w9, a64::w9, 2); + + // w10 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->and_(a64::w10, a64::w8, Bus::BIOS_MASK); + m_emit->lsr(a64::w10, a64::w10, 2); + m_emit->add(a64::w10, a64::w10, FAST_MAP_RAM_SLOT_COUNT); + + // current_instruction_pc <- pc (eax) + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, current_instruction_pc))); + + // if ((w8 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use w10 as index } + m_emit->and_(a64::w8, a64::w8, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->Mov(a64::w11, Bus::BIOS_BASE); + m_emit->cmp(a64::w8, a64::w11); + m_emit->csel(a64::w8, a64::w9, a64::w10, a64::lt); + + // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue + EmitLoadGlobalAddress(9, CodeCache::GetFastMapPointer()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3)); + m_emit->blr(a64::x8); + + // end while + m_emit->Bind(&downcount_hit); + + // check events then for frame done + EmitFunctionCall(nullptr, &TimingEvents::RunEvents); + m_emit->b(&frame_done_loop); + + // all done + m_emit->Bind(&exit_dispatcher); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_emit->Ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_InfoPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + return ptr; +} + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 142f86fad..82fc81cee 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,9 +1,12 @@ #include "common/align.h" +#include "common/log.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" +Log_SetChannel(Recompiler::CodeGenerator); namespace CPU::Recompiler { @@ -187,10 +190,12 @@ Value CodeGenerator::GetValueInHostRegister(const Value& value, bool allow_zero_ void CodeGenerator::EmitBeginBlock() { + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + // Store the CPU struct pointer. const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -2516,4 +2521,118 @@ void CodeGenerator::EmitBindLabel(LabelType* label) m_emit->L(*label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const s64 displacement = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr())) + 2; + if (Xbyak::inner::IsInInt32(static_cast(displacement))) + m_emit->lea(GetHostReg64(host_reg), m_emit->dword[m_emit->rip + ptr]); + else + m_emit->mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); +} + +CodeBlock::HostCodePointer CodeGenerator::CompileDispatcher() +{ + m_register_cache.ReserveCallerSavedRegisters(); + + EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state); + + Xbyak::Label frame_done_loop; + Xbyak::Label exit_dispatcher; + m_emit->L(frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->test(m_emit->byte[m_emit->rbp + offsetof(State, frame_done)], 1); + m_emit->jnz(exit_dispatcher, Xbyak::CodeGenerator::T_NEAR); + + // eax <- sr + Xbyak::Label no_interrupt; + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.sr.bits)]); + + // if Iec == 0 then goto no_interrupt + m_emit->test(m_emit->eax, 1); + m_emit->jz(no_interrupt); + + // sr & cause + m_emit->and_(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.cause.bits)]); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->test(m_emit->eax, 0xFF00); + m_emit->jz(no_interrupt); + + // we have an interrupt + EmitFunctionCall(nullptr, &DispatchInterrupt); + + // no interrupt or we just serviced it + m_emit->L(no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // eax <- head event->downcount + // downcount <- eax + EmitLoadGlobalAddress(Xbyak::Operand::RAX, TimingEvents::GetHeadEventPtr()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax]); + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rax + offsetof(TimingEvent, m_downcount)]); + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, downcount)], m_emit->eax); + + // main dispatch loop + Xbyak::Label main_loop; + m_emit->align(16); + m_emit->L(main_loop); + + // eax <- pending_ticks + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]); + + // while eax < downcount + Xbyak::Label downcount_hit; + m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, downcount)]); + m_emit->jge(downcount_hit); + + // time to lookup the block + // eax <- pc + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, regs.pc)]); + + // ebx <- (pc & RAM_MASK) >> 2 + m_emit->mov(m_emit->ebx, m_emit->eax); + m_emit->and_(m_emit->ebx, Bus::RAM_MASK); + m_emit->shr(m_emit->ebx, 2); + + // ecx <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->mov(m_emit->ecx, m_emit->eax); + m_emit->and_(m_emit->ecx, Bus::BIOS_MASK); + m_emit->shr(m_emit->ecx, 2); + m_emit->add(m_emit->ecx, FAST_MAP_RAM_SLOT_COUNT); + + // current_instruction_pc <- pc (eax) + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, current_instruction_pc)], m_emit->eax); + + // if ((eax (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use ecx as index } + m_emit->and_(m_emit->eax, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->cmp(m_emit->eax, Bus::BIOS_BASE); + m_emit->cmovge(m_emit->ebx, m_emit->ecx); + + // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue + EmitLoadGlobalAddress(Xbyak::Operand::RAX, CodeCache::GetFastMapPointer()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax + m_emit->rbx * 8]); + m_emit->call(m_emit->rax); + m_emit->jmp(main_loop); + + // end while + m_emit->L(downcount_hit); + + // check events then for frame done + EmitFunctionCall(nullptr, &TimingEvents::RunEvents); + m_emit->jmp(frame_done_loop); + + // all done + m_emit->L(exit_dispatcher); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_InfoPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + return ptr; +} + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp index dac853c81..91738eead 100644 --- a/src/core/cpu_recompiler_register_cache.cpp +++ b/src/core/cpu_recompiler_register_cache.cpp @@ -351,6 +351,33 @@ u32 RegisterCache::PopCalleeSavedRegisters(bool commit) return count; } +void RegisterCache::ReserveCallerSavedRegisters() +{ + for (u32 reg = 0; reg < HostReg_Count; reg++) + { + if ((m_state.host_reg_state[reg] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) == + HostRegState::CalleeSaved) + { + DebugAssert(m_state.callee_saved_order_count < HostReg_Count); + m_code_generator.EmitPushHostReg(static_cast(reg), GetActiveCalleeSavedRegisterCount()); + m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast(reg); + m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated; + } + } +} + +void RegisterCache::AssumeCalleeSavedRegistersAreSaved() +{ + for (u32 i = 0; i < HostReg_Count; i++) + { + if ((m_state.host_reg_state[i] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) == + HostRegState::CalleeSaved) + { + m_state.host_reg_state[i] &= ~HostRegState::CalleeSaved; + } + } +} + void RegisterCache::PushState() { // need to copy this manually because of the load delay values diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index 0c989f296..b1092bef7 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -248,6 +248,12 @@ public: /// Restore callee-saved registers. Call at the end of the function. u32 PopCalleeSavedRegisters(bool commit); + /// Preallocates caller saved registers, enabling later use without stack pushes. + void ReserveCallerSavedRegisters(); + + /// Removes the callee-saved register flag from all registers. Call when compiling code blocks. + void AssumeCalleeSavedRegistersAreSaved(); + /// Pushes the register allocator state, use when entering branched code. void PushState(); diff --git a/src/core/dma.cpp b/src/core/dma.cpp index d685f5158..e4168e88b 100644 --- a/src/core/dma.cpp +++ b/src/core/dma.cpp @@ -4,6 +4,7 @@ #include "common/log.h" #include "common/state_wrapper.h" #include "common/string_util.h" +#include "cpu_code_cache.h" #include "cpu_core.h" #include "gpu.h" #include "interrupt_controller.h" @@ -499,7 +500,7 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen const u32 terminator = UINT32_C(0xFFFFFF); std::memcpy(&ram_pointer[address], &terminator, sizeof(terminator)); - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } @@ -547,6 +548,6 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen } } - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp index 8197eff80..4d1595f85 100644 --- a/src/core/timing_event.cpp +++ b/src/core/timing_event.cpp @@ -53,6 +53,11 @@ void UpdateCPUDowncount() CPU::g_state.downcount = s_active_events_head->GetDowncount(); } +TimingEvent** GetHeadEventPtr() +{ + return &s_active_events_head; +} + static void SortEvent(TimingEvent* event) { const TickCount event_downcount = event->m_downcount; diff --git a/src/core/timing_event.h b/src/core/timing_event.h index ca58ddbdf..0e012a1d7 100644 --- a/src/core/timing_event.h +++ b/src/core/timing_event.h @@ -88,6 +88,8 @@ void RunEvents(); void UpdateCPUDowncount(); +TimingEvent** GetHeadEventPtr(); + } // namespace TimingEventManager \ No newline at end of file