CPU/Recompiler: Use condition select for ICache updates

Tidy ~4% perf boost.
This commit is contained in:
Stenzek 2024-12-15 16:00:01 +10:00
parent 666fee2df7
commit 9d52e27e16
No known key found for this signature in database
3 changed files with 32 additions and 23 deletions

View File

@@ -617,13 +617,19 @@ void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
}
else if (m_block->icache_line_count > 0)
{
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
const auto& ticks_reg = RARG1;
const auto& current_tag_reg = RARG2;
const auto& existing_tag_reg = RARG3;
const auto& fill_ticks_reg = r5;
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
armEmitMov(armAsm, current_tag_reg, current_pc);
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{
@@ -644,12 +650,9 @@ void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
Label cache_hit;
armAsm->ldr(existing_tag_reg, line_addr);
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->b(eq, &cache_hit);
armAsm->str(current_tag_reg, line_addr);
armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks)));
armAsm->bind(&cache_hit);
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->add(ne, ticks_reg, ticks_reg, fill_ticks_reg);
if (i != (m_block->icache_line_count - 1))
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));

View File

@@ -780,28 +780,29 @@ void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
const auto& ticks_reg = RWARG1;
const auto& current_tag_reg = RWARG2;
const auto& existing_tag_reg = RWARG3;
const auto& fill_ticks_reg = w4;
const auto& ticks_to_add_reg = w5;
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
armEmitMov(armAsm, current_tag_reg, current_pc);
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
continue;
const u32 line = GetICacheLine(current_pc);
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
Label cache_hit;
armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->b(&cache_hit, eq);
armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks)));
armAsm->bind(&cache_hit);
armAsm->cmp(existing_tag_reg, current_tag_reg);
armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);
if (i != (m_block->icache_line_count - 1))
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));

View File

@@ -506,27 +506,32 @@ void CPU::X64Recompiler::GenerateICacheCheckAndUpdate()
}
else if (m_block->icache_line_count > 0)
{
// RAM to ROM is not contiguous, therefore the cost will be the same across the entire block.
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
return;
cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]);
cg->xor_(RWARG2, RWARG2);
cg->mov(RWARG4, fill_ticks);
// TODO: Vectorize this...
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
{
const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc);
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
if (fill_ticks <= 0)
continue;
const u32 line = GetICacheLine(current_pc);
const u32 offset = (line * sizeof(u32));
Xbyak::Label cache_hit;
cg->xor_(RWARG3, RWARG3);
cg->cmp(cg->dword[RXARG1 + offset], tag);
cg->je(cache_hit);
cg->mov(cg->dword[RXARG1 + offset], tag);
cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(fill_ticks));
cg->L(cache_hit);
cg->cmovne(RWARG3, RWARG4);
cg->add(RWARG2, RWARG3);
}
cg->add(cg->dword[PTR(&g_state.pending_ticks)], RWARG2);
}
}