CPU/Recompiler: Use condition select for ICache updates

Replacing the branch with a conditional select gives a tidy ~4% performance boost.
This commit is contained in:
Stenzek 2024-12-15 16:00:01 +10:00
parent 666fee2df7
commit 9d52e27e16
No known key found for this signature in database
3 changed files with 32 additions and 23 deletions

View File

@ -617,13 +617,19 @@ void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
} }
else if (m_block->icache_line_count > 0) else if (m_block->icache_line_count > 0)
{ {
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
const auto& ticks_reg = RARG1; const auto& ticks_reg = RARG1;
const auto& current_tag_reg = RARG2; const auto& current_tag_reg = RARG2;
const auto& existing_tag_reg = RARG3; const auto& existing_tag_reg = RARG3;
const auto& fill_ticks_reg = r5;
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks)); armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
armEmitMov(armAsm, current_tag_reg, current_pc); armEmitMov(armAsm, current_tag_reg, current_pc);
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{ {
@ -644,12 +650,9 @@ void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
Label cache_hit; Label cache_hit;
armAsm->ldr(existing_tag_reg, line_addr); armAsm->ldr(existing_tag_reg, line_addr);
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->b(eq, &cache_hit);
armAsm->str(current_tag_reg, line_addr); armAsm->str(current_tag_reg, line_addr);
armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks))); armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->bind(&cache_hit); armAsm->add(ne, ticks_reg, ticks_reg, fill_ticks_reg);
if (i != (m_block->icache_line_count - 1)) if (i != (m_block->icache_line_count - 1))
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE)); armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));

View File

@ -780,28 +780,29 @@ void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
const auto& ticks_reg = RWARG1; const auto& ticks_reg = RWARG1;
const auto& current_tag_reg = RWARG2; const auto& current_tag_reg = RWARG2;
const auto& existing_tag_reg = RWARG3; const auto& existing_tag_reg = RWARG3;
const auto& fill_ticks_reg = w4;
const auto& ticks_to_add_reg = w5;
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks)); armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
armEmitMov(armAsm, current_tag_reg, current_pc); armEmitMov(armAsm, current_tag_reg, current_pc);
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{ {
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
continue;
const u32 line = GetICacheLine(current_pc); const u32 line = GetICacheLine(current_pc);
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32)); const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
Label cache_hit; Label cache_hit;
armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset)); armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->b(&cache_hit, eq);
armAsm->str(current_tag_reg, MemOperand(RSTATE, offset)); armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks))); armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->bind(&cache_hit); armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);
if (i != (m_block->icache_line_count - 1)) if (i != (m_block->icache_line_count - 1))
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE)); armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));

View File

@ -506,27 +506,32 @@ void CPU::X64Recompiler::GenerateICacheCheckAndUpdate()
} }
else if (m_block->icache_line_count > 0) else if (m_block->icache_line_count > 0)
{ {
// RAM to ROM is not contiguous, therefore the cost will be the same across the entire block.
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]); cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]);
cg->xor_(RWARG2, RWARG2);
cg->mov(RWARG4, fill_ticks);
// TODO: Vectorize this... // TODO: Vectorize this...
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{ {
const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc); const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc);
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
continue;
const u32 line = GetICacheLine(current_pc); const u32 line = GetICacheLine(current_pc);
const u32 offset = (line * sizeof(u32)); const u32 offset = (line * sizeof(u32));
Xbyak::Label cache_hit;
cg->xor_(RWARG3, RWARG3);
cg->cmp(cg->dword[RXARG1 + offset], tag); cg->cmp(cg->dword[RXARG1 + offset], tag);
cg->je(cache_hit);
cg->mov(cg->dword[RXARG1 + offset], tag); cg->mov(cg->dword[RXARG1 + offset], tag);
cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(fill_ticks)); cg->cmovne(RWARG3, RWARG4);
cg->L(cache_hit); cg->add(RWARG2, RWARG3);
} }
cg->add(cg->dword[PTR(&g_state.pending_ticks)], RWARG2);
} }
} }