CPU/Recompiler/AArch32: Load membase on demand

This commit is contained in:
Stenzek 2023-10-19 21:53:57 +10:00
parent cce1ec598c
commit 52e0d8d473
No known key found for this signature in database
4 changed files with 36 additions and 30 deletions

View File

@ -382,7 +382,6 @@ void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional<u32>& new
DebugAssert(!m_dirty_pc);
// TODO: try extracting this to a function
// TODO: move the cycle flush in here..
// save cycles for event test
const TickCount cycles = std::exchange(m_cycles, 0);
@ -621,7 +620,12 @@ void CPU::NewRec::AArch64Compiler::Flush(u32 flags)
if (flags & FLUSH_INSTRUCTION_BITS)
{
// This sucks, but it's only used for fallbacks.
Panic("Not implemented");
EmitMov(RWARG1, inst->bits);
EmitMov(RWARG2, m_current_instruction_pc);
EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
}
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
@ -699,26 +703,23 @@ void CPU::NewRec::AArch64Compiler::Compile_Fallback()
{
// Flushes state with FLUSH_FOR_INTERPRETER and then, in the active #else branch, panics with "Fixme".
// Everything between #if 0 and #else is compiled out.
// NOTE(review): this listing looks like a diff with its +/- markers stripped, so the disabled region
// contains each step twice: once written against cg-> and once against armAsm->/EmitCall/EmitMov.
// Presumably the cg-> lines are the removed side -- confirm against the repository before editing.
Flush(FLUSH_FOR_INTERPRETER);
#if 0
// Disabled: call the interpreter thunk for the current instruction.
cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);
EmitCall(armAsm, &CPU::Recompiler::Thunks::InterpretInstruction);
// TODO: make me less garbage
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
// but nothing should be going through here..
// Disabled: if next_load_delay_reg != Reg::count, copy next_load_delay_{reg,value} into
// load_delay_{reg,value} and reset next_load_delay_reg to Reg::count; otherwise skip to no_load_delay.
Label no_load_delay;
cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
cg->cmp(RWARG1, static_cast<u8>(Reg::count));
cg->je(no_load_delay, CodeGenerator::T_SHORT);
cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u32>(Reg::count));
cg->L(no_load_delay);
armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
armAsm->b(&no_load_delay, eq);
armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
EmitMov(RWARG1, static_cast<u32>(Reg::count));
armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
armAsm->bind(&no_load_delay);
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
#else
// Active path: the fallback is not implemented for this backend, so compilation aborts here.
Panic("Fixme");
#endif
}
void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::WRegister& pcreg)

View File

@ -122,6 +122,7 @@ public:
const Value& address, RegSize size, const Value& value);
void EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value, bool in_far_code);
// Emits a load of the fastmem base pointer into its host register unless m_membase_loaded is already set.
void EnsureMembaseLoaded();
// Marks the fastmem base as needing to be (re)loaded by clearing m_membase_loaded.
// NOTE(review): described from the AArch32 implementation in this change; other backends may differ -- confirm.
void EmitUpdateFastmemBase();
// Unconditional branch to pointer. May allocate a scratch register.
@ -291,6 +292,7 @@ private:
bool m_load_delay_dirty = false;
bool m_next_load_delay_dirty = false;
bool m_gte_busy_cycles_dirty = false;
// Code-generation-time flag: set by EnsureMembaseLoaded() after it emits the fastmem base load,
// cleared by PrepareStackForCall() and EmitUpdateFastmemBase() so the next access reloads it.
bool m_membase_loaded = false;
//////////////////////////////////////////////////////////////////////////
// Speculative Constants

View File

@ -224,13 +224,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
// Drop the register alias macros so the names below can be redefined as HostReg constants.
#undef RARG3
#undef RARG4
#undef RSCRATCH
#undef RMEMBASE
#undef RSTATE
namespace CPU::Recompiler {
// Host register numbers used by this code generator.
constexpr HostReg RCPUPTR = 4;
// NOTE(review): RMEMBASEPTR appears twice (5, then 3). This listing looks like a diff with its +/-
// markers stripped, so presumably one line is the old value and the other its replacement; two
// definitions in one scope would not compile. Confirm which value is current. Note that 3 is the
// same register number the RARG4 alias maps to (r3), presumably a caller-saved register -- verify.
constexpr HostReg RMEMBASEPTR = 5;
constexpr HostReg RMEMBASEPTR = 3;
constexpr HostReg RRETURN = 0;
constexpr HostReg RARG1 = 0;
constexpr HostReg RARG2 = 1;
@ -385,14 +384,6 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
// m_emit->Mov(GetCPUPtrReg(), reinterpret_cast<uintptr_t>(&g_state));
DebugAssert(cpu_reg_allocated);
UNREFERENCED_VARIABLE(cpu_reg_allocated);
// If there's loadstore instructions, preload the fastmem base.
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
{
const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
Assert(fastmem_reg_allocated);
m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
}
}
}
@ -400,9 +391,6 @@ void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, const void* j
{
if (free_registers)
{
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
m_register_cache.FreeHostReg(RMEMBASEPTR);
m_register_cache.FreeHostReg(RCPUPTR);
m_register_cache.FreeHostReg(14);
m_register_cache.PopCalleeSavedRegisters(true);
@ -1058,6 +1046,7 @@ void CodeGenerator::EmitSetConditionResult(HostReg to_reg, RegSize to_size, Cond
// Gets the generator ready to emit a host function call.
// Pushes the caller-saved registers and invalidates the "fastmem base loaded" flag, so the next
// guest memory access emits a fresh load of the base pointer. Always returns 0.
u32 CodeGenerator::PrepareStackForCall()
{
  constexpr u32 result = 0;

  // Save whatever caller-saved host registers are live before the call is emitted.
  m_register_cache.PushCallerSavedRegisters();

  // From here on the fastmem base register cannot be assumed to still hold the base.
  m_membase_loaded = false;

  return result;
}
@ -1351,13 +1340,24 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value)
}
}
void CodeGenerator::EnsureMembaseLoaded()
{
if (m_membase_loaded)
return;
m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
m_membase_loaded = true;
}
// Invalidates the "fastmem base loaded" flag so that the next EnsureMembaseLoaded() call emits a
// fresh load of State::fastmem_base.
void CodeGenerator::EmitUpdateFastmemBase()
{
// NOTE(review): as listed, this emits a load of the base and then immediately marks it as not
// loaded, so the next EnsureMembaseLoaded() would load it again. This listing looks like a diff
// with its +/- markers stripped; presumably the Ldr is the removed line and the flag reset is its
// replacement -- confirm against the repository.
m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
m_membase_loaded = false;
}
void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result)
{
EnsureMembaseLoaded();
HostReg address_reg;
if (address.IsConstant())
{
@ -1396,6 +1396,8 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result)
{
EnsureMembaseLoaded();
HostReg address_reg;
if (address.IsConstant())
{
@ -1538,6 +1540,8 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const Co
void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value)
{
EnsureMembaseLoaded();
Value actual_value = GetValueInHostRegister(value);
HostReg address_reg;

View File

@ -84,7 +84,6 @@ constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
// Aliases mapping recompiler register roles onto vixl AArch32 register objects.
#define RARG4 vixl::aarch32::r3
#define RSCRATCH vixl::aarch32::r12
#define RSTATE vixl::aarch32::r4
// NOTE(review): the hunk header (7 lines -> 6) indicates one line in this region was removed;
// presumably it is this RMEMBASE alias -- confirm before relying on it.
#define RMEMBASE vixl::aarch32::r5
// Presumably returns the displacement from `current` to `target` for a PC-relative branch -- verify against the definition.
s32 armGetPCDisplacement(const void* current, const void* target);
// Presumably reports whether `displacement` fits a branch instruction's immediate field -- verify against the definition.
bool armIsPCDisplacementInImmediateRange(s32 displacement);