PPU Analyser: compile certain functions on a per-instruction basis

PPU LLVM: optimize small blocks
This commit is contained in:
Nekotekina 2021-01-27 11:50:51 +03:00
parent 891ebd0cb1
commit 8a029159cd
2 changed files with 101 additions and 16 deletions

View File

@ -633,7 +633,7 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
{
if (!_seg.addr) continue;
if (value >= _seg.addr && value < _seg.addr + _seg.size)
if (value >= start && value < end)
{
addr_heap.emplace(value);
break;
@ -1527,6 +1527,25 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
// Decompose functions to basic blocks
for (auto&& [_, func] : as_rvalue(std::move(fmap)))
{
if (func.attr & ppu_attr::no_size && entry)
{
// Disabled for PRX for now
const u32 lim = get_limit(func.addr);
ppu_log.warning("Function 0x%x will be compiled on per-instruction basis (next=0x%x)", func.addr, lim);
for (u32 addr = func.addr; addr < lim; addr += 4)
{
auto& block = fmap[addr];
block.addr = addr;
block.size = 4;
block.toc = func.toc;
block.attr = ppu_attr::no_size;
}
continue;
}
for (auto [addr, size] : func.blocks)
{
if (!size)
@ -1583,7 +1602,7 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
case 109:
case 110:
{
ppu_log.notice("Added block from reloc: 0x%x (0x%x, %u)", target, rel.addr, rel.type);
ppu_log.trace("Added block from reloc: 0x%x (0x%x, %u) (heap=%d)", target, rel.addr, rel.type, addr_heap.count(target));
block_queue.emplace_back(target, 0);
block_set.emplace(target);
continue;
@ -1598,8 +1617,11 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
u32 exp = start;
u32 lim = end;
// Start with full scan
block_queue.emplace_back(exp, lim);
// Start with full scan (disabled for PRX for now)
if (entry)
{
block_queue.emplace_back(exp, lim);
}
// block_queue may grow
for (usz i = 0; i < block_queue.size(); i++)
@ -1731,6 +1753,11 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
block.addr = exp;
block.size = i_pos - exp;
ppu_log.trace("Block __0x%x added (size=0x%x)", block.addr, block.size);
if (get_limit(exp) == end)
{
block.attr += ppu_attr::no_size;
}
}
}
@ -1750,9 +1777,26 @@ void ppu_module::analyse(u32 lib_toc, u32 entry)
}
// Convert map to vector (destructive)
for (auto&& pair : as_rvalue(std::move(fmap)))
for (auto&& [_, block] : as_rvalue(std::move(fmap)))
{
funcs.emplace_back(std::move(pair.second));
if (block.attr & ppu_attr::no_size && block.size > 4 && entry)
{
// Disabled for PRX for now
ppu_log.warning("Block 0x%x will be compiled on per-instruction basis (size=0x%x)", block.addr, block.size);
for (u32 addr = block.addr; addr < block.addr + block.size; addr += 4)
{
auto& i = funcs.emplace_back();
i.addr = addr;
i.size = 4;
i.toc = block.toc;
i.attr = ppu_attr::no_size;
}
continue;
}
funcs.emplace_back(std::move(block));
}
ppu_log.notice("Block analysis: %zu blocks (%zu enqueued)", funcs.size(), block_queue.size());

View File

@ -15,6 +15,7 @@
using namespace llvm;
const ppu_decoder<PPUTranslator> s_ppu_decoder;
const ppu_decoder<ppu_itype> s_ppu_itype;
const ppu_decoder<ppu_iname> s_ppu_iname;
PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_module& info, ExecutionEngine& engine)
@ -161,20 +162,60 @@ Function* PPUTranslator::Translate(const ppu_function& info)
const u64 base = m_reloc ? m_reloc->addr : 0;
m_addr = info.addr - base;
// Don't emit check in small blocks without terminator
bool need_check = info.size >= 16;
for (u32 addr = m_addr; addr < m_addr + info.size; addr += 4)
{
const u32 op = vm::read32(vm::cast(addr + base));
switch (s_ppu_itype.decode(op))
{
case ppu_itype::UNK:
case ppu_itype::ECIWX:
case ppu_itype::ECOWX:
case ppu_itype::TD:
case ppu_itype::TDI:
case ppu_itype::TW:
case ppu_itype::TWI:
case ppu_itype::B:
case ppu_itype::BC:
case ppu_itype::BCCTR:
case ppu_itype::BCLR:
case ppu_itype::SC:
{
need_check = true;
break;
}
default:
{
break;
}
}
}
m_thread = &*m_function->arg_begin();
m_base_loaded = m_ir->CreateLoad(m_base);
const auto body = BasicBlock::Create(m_context, "__body", m_function);
// Check status register in the entry block
const auto vstate = m_ir->CreateLoad(m_ir->CreateStructGEP(nullptr, m_thread, 1), true);
const auto vcheck = BasicBlock::Create(m_context, "__test", m_function);
m_ir->CreateCondBr(m_ir->CreateIsNull(vstate), body, vcheck, m_md_likely);
if (need_check)
{
// Check status register in the entry block
const auto vstate = m_ir->CreateLoad(m_ir->CreateStructGEP(nullptr, m_thread, 1), true);
const auto vcheck = BasicBlock::Create(m_context, "__test", m_function);
m_ir->CreateCondBr(m_ir->CreateIsNull(vstate), body, vcheck, m_md_likely);
// Create tail call to the check function
m_ir->SetInsertPoint(vcheck);
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
m_ir->CreateRetVoid();
}
else
{
m_ir->CreateBr(body);
}
// Create tail call to the check function
m_ir->SetInsertPoint(vcheck);
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(body);
// Process blocks
@ -2990,7 +3031,7 @@ void PPUTranslator::EQV(ppu_opcode_t op)
// PPUTranslator handler for the ECIWX opcode.
// NOTE(review): this text is a rendered diff hunk (its header reports 7 lines both
// before and after the change), so the two statements in the body are most likely
// the replaced line followed by its replacement, not code that executes back to
// back. Confirm against the real file before relying on either one.
void PPUTranslator::ECIWX(ppu_opcode_t op)
{
// Writes to GPR op.rd the u64 result of Call(..., "__eciwx", operand), where the
// operand is GetGpr(op.ra) + GetGpr(op.rb) when op.ra is non-zero and
// GetGpr(op.rb) alone otherwise.
SetGpr(op.rd, Call(GetType<u64>(), "__eciwx", op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb)));
// Forwards the opcode to the UNK handler. The opcode scan in Translate() lists
// ECIWX next to UNK among the instruction types that force need_check = true.
UNK(op);
}
void PPUTranslator::LHZUX(ppu_opcode_t op)
@ -3111,7 +3152,7 @@ void PPUTranslator::ORC(ppu_opcode_t op)
// PPUTranslator handler for the ECOWX opcode.
// NOTE(review): this text is a rendered diff hunk (its header reports 7 lines both
// before and after the change), so the two statements in the body are most likely
// the replaced line followed by its replacement, not code that executes back to
// back. Confirm against the real file before relying on either one.
void PPUTranslator::ECOWX(ppu_opcode_t op)
{
// Issues Call(..., "__ecowx", operand, GetGpr(op.rs, 32)) with no return value.
// The operand is GetGpr(op.ra) + GetGpr(op.rb) when op.ra is non-zero and
// GetGpr(op.rb) alone otherwise. The 32 passed to GetGpr is presumably a bit
// width - TODO confirm against GetGpr's declaration.
Call(GetType<void>(), "__ecowx", op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), GetGpr(op.rs, 32));
// Forwards the opcode to the UNK handler. The opcode scan in Translate() lists
// ECOWX next to UNK among the instruction types that force need_check = true.
UNK(op);
}
void PPUTranslator::STHUX(ppu_opcode_t op)