diff --git a/src/core/cpu_newrec_compiler.cpp b/src/core/cpu_newrec_compiler.cpp
index 4e9a50114..eb21f885d 100644
--- a/src/core/cpu_newrec_compiler.cpp
+++ b/src/core/cpu_newrec_compiler.cpp
@@ -616,8 +616,8 @@ u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags)
         }
       }
 
-      Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetHostRegName(lowest),
-                      GetRegName(ra.reg));
+      Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetRegName(ra.reg),
+                      GetHostRegName(lowest));
     }
     break;
     case HR_TYPE_LOAD_DELAY_VALUE:
@@ -628,8 +628,8 @@ u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags)
     break;
     case HR_TYPE_NEXT_LOAD_DELAY_VALUE:
     {
-      Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation",
-                      GetHostRegName(lowest), GetRegName(ra.reg));
+      Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation", GetRegName(ra.reg),
+                      GetHostRegName(lowest));
     }
     break;
     default:
@@ -875,6 +875,7 @@ void CPU::NewRec::Compiler::FlushHostReg(u32 reg)
 void CPU::NewRec::Compiler::FreeHostReg(u32 reg)
 {
   DebugAssert(IsHostRegAllocated(reg));
+  Log_DebugPrintf("Freeing host register %s", GetHostRegName(reg));
   FlushHostReg(reg);
   ClearHostReg(reg);
 }
diff --git a/src/core/cpu_newrec_compiler_aarch32.cpp b/src/core/cpu_newrec_compiler_aarch32.cpp
index e96897cd4..a15eef73a 100644
--- a/src/core/cpu_newrec_compiler_aarch32.cpp
+++ b/src/core/cpu_newrec_compiler_aarch32.cpp
@@ -284,8 +284,8 @@ bool foo(const void* a, const void* b)
   while (size >= 4)
   {
     armAsm->ldr(RARG3, MemOperand(RARG1, offset));
-    armAsm->ldr(RARG4, MemOperand(RARG2, offset));
-    armAsm->cmp(RARG3, RARG4);
+    armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
+    armAsm->cmp(RARG3, RSCRATCH);
     armAsm->b(ne, &block_changed);
     offset += 4;
     size -= 4;
@@ -723,7 +723,7 @@ void CPU::NewRec::AArch32Compiler::Compile_Fallback()
 {
   Flush(FLUSH_FOR_INTERPRETER);
 
-  EmitCall(armAsm, reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
+  EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
 
   // TODO: make me less garbage
   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
@@ -1637,9 +1637,9 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
   {
     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
     // new_value = (value & mask) | (RWRET << (24 - shift));
-    EmitMov(RARG4, 0xFFFFFFu);
-    armAsm->lsr(RARG4, RARG4, RARG2);
-    armAsm->and_(value, value, RARG4);
+    EmitMov(RSCRATCH, 0xFFFFFFu);
+    armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
+    armAsm->and_(value, value, RSCRATCH);
     armAsm->lsl(RRET, RRET, RARG3);
     armAsm->orr(value, value, RRET);
   }
@@ -1648,9 +1648,9 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
     // new_value = (value & mask) | (RWRET >> shift);
     armAsm->lsr(RRET, RRET, RARG2);
-    EmitMov(RARG4, 0xFFFFFF00u);
-    armAsm->lsl(RARG4, RARG4, RARG3);
-    armAsm->and_(value, value, RARG4);
+    EmitMov(RSCRATCH, 0xFFFFFF00u);
+    armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
+    armAsm->and_(value, value, RSCRATCH);
     armAsm->orr(value, value, RRET);
   }
 
@@ -1857,15 +1857,20 @@ void CPU::NewRec::AArch32Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                          Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                          RARG1;
+  const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      armAsm->ldr(RARG2, PTR(ptr));
+      armAsm->ldr(data, PTR(ptr));
     }
     break;
 
@@ -1875,7 +1880,7 @@ void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RARG1, index);
       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      armAsm->mov(RARG2, RRET);
+      armAsm->mov(data, RRET);
     }
     break;
 
@@ -1886,29 +1891,23 @@ void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       break;
   }
 
-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
   if (!g_settings.gpu_pgxp_enable)
   {
-    const Register addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RARG2, size, use_fastmem);
-    return;
+    if (addr.GetCode() != RARG1.GetCode())
+      FreeHostReg(addr.GetCode());
+  }
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    armAsm->mov(RARG3, data);
+    FreeHostReg(data.GetCode());
+    armAsm->mov(RARG2, addr);
+    FreeHostReg(addr.GetCode());
+    EmitMov(RARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const Register addr_reg = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register data_backup = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  armAsm->mov(data_backup, RARG2);
-  GenerateStore(addr_reg, RARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  armAsm->mov(RARG3, data_backup);
-  armAsm->mov(RARG2, addr_reg);
-  FreeHostReg(addr_reg.GetCode());
-  FreeHostReg(data_backup.GetCode());
-  EmitMov(RARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
 }
 
 void CPU::NewRec::AArch32Compiler::Compile_mtc0(CompileFlags cf)
diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp
index b040e4fe1..34a1ce5a8 100644
--- a/src/core/cpu_newrec_compiler_aarch64.cpp
+++ b/src/core/cpu_newrec_compiler_aarch64.cpp
@@ -693,7 +693,7 @@ void CPU::NewRec::AArch64Compiler::Compile_Fallback()
 {
   Flush(FLUSH_FOR_INTERPRETER);
 
-  EmitCall(armAsm, &CPU::Recompiler::Thunks::InterpretInstruction);
+  EmitCall(reinterpret_cast<const void*>(&CPU::Recompiler::Thunks::InterpretInstruction));
 
   // TODO: make me less garbage
   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
@@ -1616,9 +1616,9 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
   {
     // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
     // new_value = (value & mask) | (RWRET << (24 - shift));
-    EmitMov(RWARG4, 0xFFFFFFu);
-    armAsm->lsrv(RWARG4, RWARG4, RWARG2);
-    armAsm->and_(value, value, RWARG4);
+    EmitMov(RWSCRATCH, 0xFFFFFFu);
+    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
+    armAsm->and_(value, value, RWSCRATCH);
     armAsm->lslv(RWRET, RWRET, RWARG3);
     armAsm->orr(value, value, RWRET);
   }
@@ -1627,9 +1627,9 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize
     // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
     // new_value = (value & mask) | (RWRET >> shift);
     armAsm->lsrv(RWRET, RWRET, RWARG2);
-    EmitMov(RWARG4, 0xFFFFFF00u);
-    armAsm->lslv(RWARG4, RWARG4, RWARG3);
-    armAsm->and_(value, value, RWARG4);
+    EmitMov(RWSCRATCH, 0xFFFFFF00u);
+    armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
+    armAsm->and_(value, value, RWSCRATCH);
     armAsm->orr(value, value, RWRET);
   }
 
@@ -1836,15 +1836,20 @@ void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const WRegister addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                           WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                           RWARG1;
+  const WRegister data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      armAsm->ldr(RWARG2, PTR(ptr));
+      armAsm->ldr(data, PTR(ptr));
     }
     break;
 
@@ -1854,7 +1859,7 @@ void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RWARG1, index);
       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      armAsm->mov(RWARG2, RWRET);
+      armAsm->mov(data, RWRET);
     }
     break;
 
@@ -1865,29 +1870,23 @@ void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       break;
   }
 
-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
   if (!g_settings.gpu_pgxp_enable)
   {
-    const WRegister addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RWARG2, size, use_fastmem);
-    return;
+    if (addr.GetCode() != RWARG1.GetCode())
+      FreeHostReg(addr.GetCode());
+  }
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    armAsm->mov(RWARG3, data);
+    FreeHostReg(data.GetCode());
+    armAsm->mov(RWARG2, addr);
+    FreeHostReg(addr.GetCode());
+    EmitMov(RWARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const WRegister addr_reg = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const WRegister data_backup = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  armAsm->mov(data_backup, RWARG2);
-  GenerateStore(addr_reg, RWARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  armAsm->mov(RWARG3, data_backup);
-  armAsm->mov(RWARG2, addr_reg);
-  EmitMov(RWARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
-  FreeHostReg(addr_reg.GetCode());
-  FreeHostReg(data_backup.GetCode());
 }
 
 void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf)
diff --git a/src/core/cpu_newrec_compiler_riscv64.cpp b/src/core/cpu_newrec_compiler_riscv64.cpp
index 0a0631966..cffc58cb8 100644
--- a/src/core/cpu_newrec_compiler_riscv64.cpp
+++ b/src/core/cpu_newrec_compiler_riscv64.cpp
@@ -2143,15 +2143,20 @@ void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize
 void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                                 const std::optional<VirtualMemoryAddress>& address)
 {
-  FlushForLoadStore(address, true, use_fastmem);
-
   const u32 index = static_cast<u32>(inst->r.rt.GetValue());
   const auto [ptr, action] = GetGTERegisterPointer(index, false);
+  const GPR addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
+                     GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) :
+                     RARG1;
+  const GPR data = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
+  FlushForLoadStore(address, true, use_fastmem);
+  ComputeLoadStoreAddressArg(cf, address, addr);
+
   switch (action)
   {
     case GTERegisterAccessAction::Direct:
     {
-      rvAsm->LW(RARG2, PTR(ptr));
+      rvAsm->LW(data, PTR(ptr));
     }
     break;
 
@@ -2161,7 +2166,7 @@ void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       Flush(FLUSH_FOR_C_CALL);
       EmitMov(RARG1, index);
       EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
-      rvAsm->MV(RARG2, RRET);
+      rvAsm->MV(data, RRET);
     }
     break;
 
@@ -2172,29 +2177,24 @@ void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz
       break;
   }
 
-  // PGXP makes this a giant pain.
+  GenerateStore(addr, data, size, use_fastmem);
+
   if (!g_settings.gpu_pgxp_enable)
   {
-    const GPR addr = ComputeLoadStoreAddressArg(cf, address);
-    GenerateStore(addr, RARG2, size, use_fastmem);
-    return;
+    if (addr.Index() != RARG1.Index())
+      FreeHostReg(addr.Index());
+  }
+  else
+  {
+    // TODO: This can be simplified because we don't need to validate in PGXP..
+    Flush(FLUSH_FOR_C_CALL);
+    rvAsm->MV(RARG3, data);
+    FreeHostReg(data.Index());
+    rvAsm->MV(RARG2, addr);
+    FreeHostReg(addr.Index());
+    EmitMov(RARG1, inst->bits);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
   }
-
-  // TODO: This can be simplified because we don't need to validate in PGXP..
-  const GPR addr_reg = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const GPR data_backup = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  FlushForLoadStore(address, true, use_fastmem);
-  ComputeLoadStoreAddressArg(cf, address, addr_reg);
-  rvAsm->MV(data_backup, RARG2);
-  GenerateStore(addr_reg, RARG2, size, use_fastmem);
-
-  Flush(FLUSH_FOR_C_CALL);
-  rvAsm->MV(RARG3, data_backup);
-  rvAsm->MV(RARG2, addr_reg);
-  EmitMov(RARG1, inst->bits);
-  EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
-  FreeHostReg(addr_reg.Index());
-  FreeHostReg(data_backup.Index());
 }
 
 void CPU::NewRec::RISCV64Compiler::Compile_mtc0(CompileFlags cf)
diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp
index 7ff3d6c6a..1c162f209 100644
--- a/src/core/cpu_recompiler_code_generator_aarch32.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp
@@ -299,7 +299,6 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
 #undef RARG1
 #undef RARG2
 #undef RARG3
-#undef RARG4
 #undef RSCRATCH
 #undef RSTATE
 
diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h
index a58a70ab7..3dd420a81 100644
--- a/src/core/cpu_recompiler_types.h
+++ b/src/core/cpu_recompiler_types.h
@@ -117,8 +117,6 @@ constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
 #define RXARG2 vixl::aarch64::x1
 #define RWARG3 vixl::aarch64::w2
 #define RXARG3 vixl::aarch64::x2
-#define RWARG4 vixl::aarch64::w3
-#define RXARG4 vixl::aarch64::x3
 #define RWSCRATCH vixl::aarch64::w16
 #define RXSCRATCH vixl::aarch64::x16
 #define RSTATE vixl::aarch64::x19
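
Below is a minimal, self-contained C++ sketch (not part of the patch, and not the real recompiler API; every type and helper is a hypothetical stand-in) of the Compile_swc2 flow the three backends now share: pin the address and data in callee-saved temporaries only when a later C call (a GTE read handler or PGXP::CPU_SWC2) could clobber the argument registers, emit the store exactly once, then hand the preserved address/value pair to PGXP.

// swc2_flow_sketch.cpp - illustrates the emission order, not real codegen.
#include <cstdio>

enum class Action { Direct, CallHandler };

struct Reg
{
  int code;
};

// Stand-ins for the first two argument registers.
constexpr Reg RARG1{0};
constexpr Reg RARG2{1};

static bool g_pgxp_enable = true;
static int s_next_callee_saved = 8;

// Stand-in for AllocateTempHostReg(HR_CALLEE_SAVED).
static Reg AllocateCalleeSavedTemp()
{
  return Reg{s_next_callee_saved++};
}

static void FreeHostReg(int code)
{
  std::printf("  free host reg r%d\n", code);
}

static void CompileSwc2(Action action)
{
  // Pin values in callee-saved registers only when a later C call (GTE read
  // handler or PGXP::CPU_SWC2) would clobber the argument registers.
  const Reg addr = (g_pgxp_enable || action == Action::CallHandler) ? AllocateCalleeSavedTemp() : RARG1;
  const Reg data = g_pgxp_enable ? AllocateCalleeSavedTemp() : RARG2;

  std::printf("  flush for load/store; compute address -> r%d\n", addr.code);

  if (action == Action::Direct)
    std::printf("  load GTE register -> r%d\n", data.code);
  else
    std::printf("  call GTE::ReadRegister; move result -> r%d\n", data.code);

  // The store is emitted exactly once, on both the PGXP and non-PGXP paths.
  std::printf("  store r%d -> [r%d]\n", data.code, addr.code);

  if (!g_pgxp_enable)
  {
    // Release the temporary only if one was actually allocated above.
    if (addr.code != RARG1.code)
      FreeHostReg(addr.code);
  }
  else
  {
    // addr/data survived the store, so they can feed the PGXP call directly,
    // without the data_backup copy the old code needed.
    std::printf("  flush for C call; args <- r%d, r%d; call PGXP::CPU_SWC2\n", addr.code, data.code);
    FreeHostReg(data.code);
    FreeHostReg(addr.code);
  }
}

int main()
{
  std::printf("pgxp on, direct GTE read:\n");
  CompileSwc2(Action::Direct);
  g_pgxp_enable = false;
  std::printf("pgxp off, GTE read via handler:\n");
  CompileSwc2(Action::CallHandler);
  return 0;
}

On the non-PGXP path the address may still live in RARG1 itself, so the free is guarded by a register-code comparison, mirroring the addr.GetCode() != RARG1.GetCode() / addr.Index() != RARG1.Index() checks in the patch.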