CPU/Recompiler: Align dispatchers and JIT blocks

A couple of percent difference if we're lucky; in practice, probably <1%.
Stenzek 2024-12-29 18:11:39 +10:00
parent 82a843c121
commit 242561debf
8 changed files with 127 additions and 55 deletions

View File

@ -1492,18 +1492,15 @@ void CPU::CodeCache::CommitFarCode(u32 length)
void CPU::CodeCache::AlignCode(u32 alignment)
{
#if defined(CPU_ARCH_X64)
constexpr u8 padding_value = 0xcc; // int3
#else
constexpr u8 padding_value = 0x00;
#endif
DebugAssert(Common::IsPow2(alignment));
const u32 num_padding_bytes =
std::min(static_cast<u32>(Common::AlignUpPow2(reinterpret_cast<uintptr_t>(s_free_code_ptr), alignment) -
reinterpret_cast<uintptr_t>(s_free_code_ptr)),
GetFreeCodeSpace());
std::memset(s_free_code_ptr, padding_value, num_padding_bytes);
if (num_padding_bytes > 0)
EmitAlignmentPadding(s_free_code_ptr, num_padding_bytes);
s_free_code_ptr += num_padding_bytes;
s_code_used += num_padding_bytes;
}
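
For reference, the padding count above is simply the distance from the current free pointer up to the next alignment boundary, clamped to the remaining code space. A minimal standalone sketch of that arithmetic, with stand-ins for Common::IsPow2()/Common::AlignUpPow2() and a made-up address:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Stand-ins for Common::IsPow2()/Common::AlignUpPow2() used in the hunk above.
static constexpr bool IsPow2(std::uint64_t v)
{
  return v != 0 && (v & (v - 1)) == 0;
}
static constexpr std::uint64_t AlignUpPow2(std::uint64_t value, std::uint64_t alignment)
{
  return (value + (alignment - 1)) & ~(alignment - 1);
}

int main()
{
  constexpr std::uint64_t alignment = 16;                 // FUNCTION_ALIGNMENT on x64/AArch64
  const std::uint64_t free_code_ptr = 0x7f3210000009ULL;  // hypothetical s_free_code_ptr address
  assert(IsPow2(alignment));
  const std::uint64_t padding = AlignUpPow2(free_code_ptr, alignment) - free_code_ptr;
  // The real AlignCode() additionally clamps this to GetFreeCodeSpace().
  std::printf("padding=%llu, next block at 0x%llx\n", (unsigned long long)padding,
              (unsigned long long)(free_code_ptr + padding)); // padding=7, 0x7f3210000010
}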

View File

@ -247,6 +247,7 @@ bool HasPreviouslyFaultedOnPC(u32 guest_pc);
u32 EmitASMFunctions(void* code, u32 code_size);
u32 EmitJump(void* code, const void* dst, bool flush_icache);
void EmitAlignmentPadding(void* dst, size_t size);
void DisassembleAndLogHostCode(const void* start, u32 size);
u32 GetHostInstructionCount(const void* start, u32 size);

View File

@ -34,7 +34,7 @@ CPU::Recompiler::Recompiler::Recompiler() = default;
CPU::Recompiler::Recompiler::~Recompiler() = default;
void CPU::Recompiler::Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
u8* far_code_buffer, u32 far_code_space)
u8* far_code_buffer, u32 far_code_space)
{
m_block = block;
m_compiler_pc = block->pc;
@ -101,10 +101,12 @@ void CPU::Recompiler::Recompiler::BeginBlock()
}
const void* CPU::Recompiler::Recompiler::CompileBlock(CodeCache::Block* block, u32* host_code_size,
u32* host_far_code_size)
u32* host_far_code_size)
{
Reset(block, CPU::CodeCache::GetFreeCodePointer(), CPU::CodeCache::GetFreeCodeSpace(),
CPU::CodeCache::GetFreeFarCodePointer(), CPU::CodeCache::GetFreeFarCodeSpace());
CodeCache::AlignCode(FUNCTION_ALIGNMENT);
Reset(block, CodeCache::GetFreeCodePointer(), CodeCache::GetFreeCodeSpace(), CodeCache::GetFreeFarCodePointer(),
CodeCache::GetFreeFarCodeSpace());
DEBUG_LOG("Block range: {:08X} -> {:08X}", block->pc, block->pc + block->size * 4);
@ -144,8 +146,8 @@ const void* CPU::Recompiler::Recompiler::CompileBlock(CodeCache::Block* block, u
const void* code = EndCompile(&code_size, &far_code_size);
*host_code_size = code_size;
*host_far_code_size = far_code_size;
CPU::CodeCache::CommitCode(code_size);
CPU::CodeCache::CommitFarCode(far_code_size);
CodeCache::CommitCode(code_size);
CodeCache::CommitFarCode(far_code_size);
return code;
}
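
The hunk above also reorders CompileBlock(): CodeCache::AlignCode(FUNCTION_ALIGNMENT) now runs before Reset() fetches GetFreeCodePointer(), so the padding is consumed before the block's entry address is taken and every compiled block starts on an aligned boundary. A toy bump allocator illustrating that ordering (a sketch; the type and its members are made up, not DuckStation's):

#include <cstdint>
#include <cstdio>

// Toy bump allocator standing in for the code cache; all names here are illustrative.
struct ToyCodeCache
{
  std::uint8_t buffer[256];
  std::uint32_t used = 9; // pretend some code has already been emitted
  std::uint8_t* FreePtr() { return buffer + used; }
  void Align(std::uint32_t alignment)
  {
    const std::uintptr_t p = reinterpret_cast<std::uintptr_t>(FreePtr());
    const std::uintptr_t aligned = (p + alignment - 1) & ~static_cast<std::uintptr_t>(alignment - 1);
    used += static_cast<std::uint32_t>(aligned - p);
  }
};

int main()
{
  ToyCodeCache cache;
  cache.Align(16);                       // mirrors AlignCode(FUNCTION_ALIGNMENT) running first
  std::uint8_t* entry = cache.FreePtr(); // the entry pointer is only taken after aligning
  std::printf("entry %% 16 == %u\n",
              static_cast<unsigned>(reinterpret_cast<std::uintptr_t>(entry) % 16)); // 0
}
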
@ -651,7 +653,7 @@ const char* CPU::Recompiler::Recompiler::GetReadWriteModeString(u32 flags)
}
u32 CPU::Recompiler::Recompiler::AllocateHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
Reg reg /* = Reg::count */)
Reg reg /* = Reg::count */)
{
// Cancel any load delays before booting anything out
if (flags & HR_MODE_WRITE && (type == HR_TYPE_CPU_REG || type == HR_TYPE_NEXT_LOAD_DELAY_VALUE))
@ -753,7 +755,7 @@ u32 CPU::Recompiler::Recompiler::AllocateHostReg(u32 flags, HostRegAllocType typ
}
std::optional<u32> CPU::Recompiler::Recompiler::CheckHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
Reg reg /* = Reg::count */)
Reg reg /* = Reg::count */)
{
for (u32 i = 0; i < NUM_HOST_REGS; i++)
{
@ -1158,7 +1160,8 @@ void CPU::Recompiler::Recompiler::RestoreHostState()
}
void CPU::Recompiler::Recompiler::AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register,
u32 data_register, MemoryAccessSize size, bool is_signed, bool is_load)
u32 data_register, MemoryAccessSize size, bool is_signed,
bool is_load)
{
DebugAssert(CodeCache::IsUsingFastmem());
DebugAssert(address_register < NUM_HOST_REGS);
@ -1367,8 +1370,8 @@ void CPU::Recompiler::Recompiler::CompileBranchDelaySlot(bool dirty_pc /* = true
}
void CPU::Recompiler::Recompiler::CompileTemplate(void (Recompiler::*const_func)(CompileFlags),
void (Recompiler::*func)(CompileFlags), const void* pgxp_cpu_func,
u32 tflags)
void (Recompiler::*func)(CompileFlags), const void* pgxp_cpu_func,
u32 tflags)
{
// TODO: This is where we will do memory operand optimization. Remember to kill constants!
// TODO: Swap S and T if commutative
@ -1733,7 +1736,7 @@ const TickCount* CPU::Recompiler::Recompiler::GetFetchMemoryAccessTimePtr() cons
}
void CPU::Recompiler::Recompiler::FlushForLoadStore(const std::optional<VirtualMemoryAddress>& address, bool store,
bool use_fastmem)
bool use_fastmem)
{
if (use_fastmem)
return;

View File

@ -34,6 +34,9 @@ public:
static constexpr u32 NUM_HOST_REGS = 16;
static constexpr bool HAS_MEMORY_OPERANDS = true;
// Align functions to 16 bytes.
static constexpr u32 FUNCTION_ALIGNMENT = 16;
#elif defined(CPU_ARCH_ARM32)
// A reasonable "maximum" number of bytes per instruction.
@ -44,6 +47,9 @@ public:
static constexpr u32 NUM_HOST_REGS = 16;
static constexpr bool HAS_MEMORY_OPERANDS = false;
// Align functions to 4 bytes (word size).
static constexpr u32 FUNCTION_ALIGNMENT = 4;
#elif defined(CPU_ARCH_ARM64)
// A reasonable "maximum" number of bytes per instruction.
@ -56,6 +62,9 @@ public:
static constexpr u32 NUM_HOST_REGS = 32;
static constexpr bool HAS_MEMORY_OPERANDS = false;
// Align functions to 16 bytes.
static constexpr u32 FUNCTION_ALIGNMENT = 16;
#elif defined(CPU_ARCH_RISCV64)
// Number of host registers.
@ -68,6 +77,9 @@ public:
static constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64;
static constexpr u32 MIN_CODE_RESERVE_FOR_BLOCK = 512;
// Align functions to 16 bytes.
static constexpr u32 FUNCTION_ALIGNMENT = 16;
#endif
public:
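
These per-architecture constants feed CodeCache::AlignCode(), which only DebugAsserts at runtime that the value is a power of two (via Common::IsPow2). A hypothetical compile-time guard in the same spirit, not part of this commit, might look like:

#include <cstdint>

// Hypothetical compile-time guard; the values mirror the header above.
template <std::uint32_t N>
struct AlignmentCheck
{
  static_assert(N != 0 && (N & (N - 1)) == 0, "FUNCTION_ALIGNMENT must be a power of two");
  static constexpr std::uint32_t value = N;
};

// e.g. 16 for x64/AArch64/RISC-V as declared above.
inline constexpr std::uint32_t kFunctionAlignment = AlignmentCheck<16>::value;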

View File

@ -320,14 +320,17 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->FinalizeCode();
#if 0
// TODO: align?
s_trampoline_targets.clear();
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
s_trampoline_used = 0;
#endif
return static_cast<u32>(armAsm->GetCursorOffset()) /* + TRAMPOLINE_AREA_SIZE*/;
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
constexpr u8 padding_value = 0x00;
std::memset(dst, padding_value, size);
}
CPU::ARM32Recompiler::ARM32Recompiler() : m_emitter(A32), m_far_emitter(A32)
@ -1025,7 +1028,8 @@ void CPU::ARM32Recompiler::Flush(u32 flags)
void CPU::ARM32Recompiler::Compile_Fallback()
{
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
inst->bits);
Flush(FLUSH_FOR_INTERPRETER);

View File

@ -41,19 +41,20 @@ LOG_CHANNEL(Recompiler);
#define RSTATE vixl::aarch64::x19
#define RMEMBASE vixl::aarch64::x20
bool armIsCallerSavedRegister(u32 id);
s64 armGetPCDisplacement(const void* current, const void* target);
bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
bool sign_extend_word = false);
void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
const vixl::aarch64::Register& tempreg = RXSCRATCH);
u8* armGetJumpTrampoline(const void* target);
static bool armIsCallerSavedRegister(u32 id);
static s64 armGetPCDisplacement(const void* current, const void* target);
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
bool sign_extend_word = false);
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
const vixl::aarch64::Register& tempreg = RXSCRATCH);
static u8* armGetJumpTrampoline(const void* target);
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
static std::unordered_map<const void*, u32> s_trampoline_targets;
@ -327,8 +328,8 @@ void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Regis
armAsm->ldr(reg, memop);
}
void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
const vixl::aarch64::Register& tempreg)
[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
const void* addr, const vixl::aarch64::Register& tempreg)
{
DebugAssert(tempreg.IsX());
@ -359,7 +360,7 @@ u8* armGetJumpTrampoline(const void* target)
return s_trampoline_start_ptr + it->second;
// align to 16 bytes?
const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);
// 4 movs plus a jump
if (TRAMPOLINE_AREA_SIZE - offset < 20)
@ -387,6 +388,17 @@ u8* armGetJumpTrampoline(const void* target)
return start;
}
void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
{
size_t addr = armAsm->GetCursorAddress<size_t>();
const size_t end_addr = Common::AlignUpPow2(addr, alignment);
while (addr != end_addr)
{
armAsm->nop();
addr += vixl::aarch64::kInstructionSize;
}
}
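
armAlignCode() pads the vixl cursor with 4-byte NOPs until it reaches the requested boundary. A rough standalone sketch of that count, with no vixl dependency and an assumed cursor address:

#include <cstdint>
#include <cstdio>

int main()
{
  constexpr std::uint64_t kInstructionSize = 4; // AArch64 instructions are a fixed 4 bytes
  constexpr std::uint64_t alignment = 16;       // Recompiler::FUNCTION_ALIGNMENT
  const std::uint64_t cursor = 0x1008;          // assumed GetCursorAddress() value
  const std::uint64_t end = (cursor + alignment - 1) & ~(alignment - 1);
  std::printf("emit %llu nop(s)\n", (unsigned long long)((end - cursor) / kInstructionSize)); // 2
}
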
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
@ -434,7 +446,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
using namespace vixl::aarch64;
Assembler actual_asm(static_cast<u8*>(code), code_size);
Assembler* armAsm = &actual_asm;
Assembler* RESTRICT armAsm = &actual_asm;
#ifdef VIXL_DEBUG
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
@ -455,21 +467,19 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
}
// check events then for frame done
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
g_check_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
{
Label skip_event_check;
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
armAsm->cmp(RWARG1, RWARG2);
armAsm->b(&skip_event_check, lt);
armAsm->b(&dispatch, lt);
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
armAsm->bind(&skip_event_check);
}
// TODO: align?
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
g_dispatcher = armAsm->GetCursorAddress<const void*>();
{
armAsm->bind(&dispatch);
@ -486,6 +496,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->br(RXARG1);
}
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RWARG1, PTR(&g_state.pc));
@ -493,6 +504,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->b(&dispatch);
}
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RWARG1, PTR(&g_state.pc));
@ -500,6 +512,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->b(&dispatch);
}
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
g_interpret_block = armAsm->GetCursorAddress<const void*>();
{
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
@ -508,7 +521,6 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
armAsm->FinalizeCode();
// TODO: align?
s_trampoline_targets.clear();
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
s_trampoline_used = 0;
@ -516,6 +528,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
constexpr u8 padding_value = 0x00;
std::memset(dst, padding_value, size);
}
CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
{
}
@ -1174,7 +1192,8 @@ void CPU::ARM64Recompiler::Flush(u32 flags)
void CPU::ARM64Recompiler::Compile_Fallback()
{
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
inst->bits);
Flush(FLUSH_FOR_INTERPRETER);

View File

@ -317,6 +317,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
return static_cast<u32>(rvAsm->GetCodeBuffer().GetSizeInBytes());
}
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
constexpr u8 padding_value = 0x00;
std::memset(dst, padding_value, size);
}
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
// TODO: get rid of assembler construction here
@ -998,7 +1004,8 @@ void CPU::RISCV64Recompiler::Flush(u32 flags)
void CPU::RISCV64Recompiler::Compile_Fallback()
{
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
inst->bits);
Flush(FLUSH_FOR_INTERPRETER);

View File

@ -36,6 +36,7 @@ LOG_CHANNEL(Recompiler);
// PGXP TODO: LWL etc, MFC0
// PGXP TODO: Spyro 1 level gates have issues.
static constexpr u32 FUNCTION_ALIGNMENT = 16;
static constexpr u32 BACKPATCH_JMP_SIZE = 5;
static bool IsCallerSavedRegister(u32 id);
@ -134,20 +135,18 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
}
// check events then for frame done
cg->align(FUNCTION_ALIGNMENT);
g_check_events_and_dispatch = cg->getCurr();
{
Label skip_event_check;
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
cg->jl(skip_event_check);
cg->jl(dispatch);
g_run_events_and_dispatch = cg->getCurr();
cg->call(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
cg->L(skip_event_check);
}
// TODO: align?
cg->align(FUNCTION_ALIGNMENT);
g_dispatcher = cg->getCurr();
{
cg->L(dispatch);
@ -164,6 +163,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
}
cg->align(FUNCTION_ALIGNMENT);
g_compile_or_revalidate_block = cg->getCurr();
{
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
@ -171,6 +171,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
cg->jmp(dispatch);
}
cg->align(FUNCTION_ALIGNMENT);
g_discard_and_recompile_block = cg->getCurr();
{
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
@ -178,6 +179,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
cg->jmp(dispatch);
}
cg->align(FUNCTION_ALIGNMENT);
g_interpret_block = cg->getCurr();
{
cg->call(CodeCache::GetInterpretUncachedBlockFunction());
@ -201,6 +203,32 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
return 5;
}
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
// Copied from Xbyak nop(), to avoid constructing a CodeGenerator.
static const uint8_t nopTbl[9][9] = {
{0x90},
{0x66, 0x90},
{0x0F, 0x1F, 0x00},
{0x0F, 0x1F, 0x40, 0x00},
{0x0F, 0x1F, 0x44, 0x00, 0x00},
{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};
const size_t n = sizeof(nopTbl) / sizeof(nopTbl[0]);
u8* dst_ptr = static_cast<u8*>(dst);
while (size > 0)
{
size_t len = (std::min)(n, size);
const uint8_t* seq = nopTbl[len - 1];
std::memcpy(dst_ptr, seq, len);
dst_ptr += len;
size -= len;
}
}
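
The loop above greedily emits the longest multi-byte NOP in the table (9 bytes) and then whatever remains, so 13 bytes of padding become a 9-byte NOP followed by a 4-byte one. A minimal sketch of just that splitting, with an assumed padding size:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t size = 13;         // assumed padding requirement
  const std::size_t max_nop = 9; // longest sequence in nopTbl above
  while (size > 0)
  {
    const std::size_t len = std::min(max_nop, size);
    std::printf("emit %zu-byte nop\n", len); // prints 9, then 4
    size -= len;
  }
}
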
#ifdef ENABLE_HOST_DISASSEMBLY
static ZydisFormatterFunc s_old_print_address;
@ -929,7 +957,8 @@ void CPU::X64Recompiler::Flush(u32 flags)
void CPU::X64Recompiler::Compile_Fallback()
{
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
inst->bits);
Flush(FLUSH_FOR_INTERPRETER);