SPU LLVM Precompilation

Implement SPU function discovery in images or random SPU code
This commit is contained in:
Eladash 2023-08-26 11:23:42 +03:00 committed by Elad Ashkenazi
parent 290ff5b839
commit b5faf5800b
12 changed files with 352 additions and 22 deletions

View File

@ -1107,6 +1107,13 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&
if (prog.p_type == 0x1u /* LOAD */ && prog.p_filesz > 0u)
{
if (prog.p_vaddr)
{
extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size);
utilize_spu_data_segment(prog.p_vaddr, (elf_header + prog.p_offset), prog.p_filesz);
}
sha1_update(&sha2, (elf_header + prog.p_offset), prog.p_filesz);
}
@ -1119,7 +1126,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&
if (!name.empty())
{
fmt::append(dump, "\n\tSPUNAME: '%s'", name);
fmt::append(dump, "\n\tSPUNAME: '%s' (image addr: 0x%x)", name, seg.addr + i);
}
}
}

View File

@ -4030,7 +4030,7 @@ extern void ppu_initialize()
const std::string mount_point = vfs::get("/dev_flash/");
bool dev_flash_located = !Emu.GetCat().ends_with('P') && Emu.IsPathInsideDir(Emu.GetBoot(), mount_point) && g_cfg.core.ppu_llvm_precompilation;
bool dev_flash_located = !Emu.GetCat().ends_with('P') && Emu.IsPathInsideDir(Emu.GetBoot(), mount_point) && g_cfg.core.llvm_precompilation;
if (compile_fw || dev_flash_located)
{
@ -4050,7 +4050,7 @@ extern void ppu_initialize()
}
// Avoid compilation if main's cache exists or it is a standalone SELF with no PARAM.SFO
if (compile_main && g_cfg.core.ppu_llvm_precompilation && !Emu.GetTitleID().empty() && !Emu.IsChildProcess())
if (compile_main && g_cfg.core.llvm_precompilation && !Emu.GetTitleID().empty() && !Emu.IsChildProcess())
{
// Try to add all related directories
const std::set<std::string> dirs = Emu.GetGameDirs();

View File

@ -382,6 +382,18 @@ void spu_load_exec(const spu_exec_object& elf)
spu->status_npc = {SPU_STATUS_RUNNING, elf.header.e_entry};
atomic_storage<u32>::release(spu->pc, elf.header.e_entry);
const auto funcs = spu->discover_functions(spu->ls, umax);
for (u32 addr : funcs)
{
spu_log.success("Found SPU function at: 0x%08x", addr);
}
if (!funcs.empty())
{
spu_log.success("Found %u SPU functions", funcs.size());
}
}
void spu_load_rel_exec(const spu_rel_object& elf)

View File

@ -26,17 +26,17 @@ union spu_opcode_t
bf_t<u32, 7, 18> i18; // 7..24
};
// Compute a branch target: add the immediate (scaled by 4) to pc, then mask with 0x3fffc,
// which keeps the result 4-byte aligned and below 0x40000 (the address wraps instead of overflowing).
inline u32 spu_branch_target(u32 pc, u32 imm = 0)
constexpr u32 spu_branch_target(u32 pc, u32 imm = 0)
{
return (pc + (imm << 2)) & 0x3fffc;
}
// Same arithmetic as spu_branch_target, but masked with 0x3fff0:
// the result is 16-byte aligned and below 0x40000.
inline u32 spu_ls_target(u32 pc, u32 imm = 0)
constexpr u32 spu_ls_target(u32 pc, u32 imm = 0)
{
return (pc + (imm << 2)) & 0x3fff0;
}
// Extract the upper 11 bits of a 32-bit instruction word (everything above bit 20).
inline u32 spu_decode(u32 inst)
constexpr u32 spu_decode(u32 inst)
{
return inst >> 21;
}

View File

@ -516,6 +516,84 @@ spu_cache::~spu_cache()
{
}
// Collection of SPU data segments gathered by utilize_spu_data_segment() and later
// consumed (moved out) by spu_cache::initialize().
struct spu_section_data
{
// One recorded segment. Member order matters: instances are aggregate-initialized as {vaddr, inst_data}.
struct data_t
{
// Address passed to utilize_spu_data_segment() for this segment
u32 vaddr;
// Copy of the segment contents as 32-bit words
std::basic_string<u32> inst_data;
// Addresses returned by spu_thread::discover_functions() for this segment
std::vector<u32> funcs;
};
// Held by utilize_spu_data_segment() while it scans and appends to `data`
shared_mutex mtx;
// Set by spu_cache::initialize() once `data` has been taken; later segments are then ignored
atomic_t<bool> had_been_used = false;
// Recorded segments
std::vector<data_t> data;
};
// Record a loadable SPU segment as a candidate for SPU precompilation.
//
// vaddr         - address of the segment; must be 4-byte aligned and lie inside [0, SPU_LS_SIZE)
// ls_data_vaddr - pointer to at least `size` bytes of segment contents
// size          - segment size in bytes; rounded down to a multiple of 4
//
// The segment is silently ignored when it is misaligned, empty, does not fit in the range,
// precompilation is disabled, the collected data has already been consumed,
// no functions are discovered in it, or an equivalent segment is already recorded.
extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size)
{
	if (vaddr % 4)
	{
		return;
	}

	// Only whole 32-bit words are considered
	size &= -4;

	// Bounds check written without "vaddr + size": that sum can wrap around in u32 for hostile
	// header values, pass the comparison and let the memcpy below write outside of the fake LS
	if (!size || vaddr >= SPU_LS_SIZE || size > SPU_LS_SIZE - vaddr)
	{
		return;
	}

	if (!g_cfg.core.llvm_precompilation)
	{
		return;
	}

	g_fxo->need<spu_section_data>();

	auto& sections = g_fxo->get<spu_section_data>();

	if (sections.had_been_used)
	{
		// The list has already been taken by the SPU cache, nothing would consume this segment
		return;
	}

	std::basic_string<u32> inst_words(size / 4, 0);
	std::memcpy(inst_words.data(), ls_data_vaddr, size);

	spu_section_data::data_t obj{vaddr, std::move(inst_words)};

	// Place the segment at its address inside a zero-filled fake LS for the function scan
	std::vector<u8> ls_data(SPU_LS_SIZE);
	std::memcpy(ls_data.data() + vaddr, ls_data_vaddr, size);

	obj.funcs = spu_thread::discover_functions(ls_data.data(), umax);

	if (obj.funcs.empty())
	{
		// Nothing to add
		return;
	}

	for (u32 addr : obj.funcs)
	{
		spu_log.notice("Found SPU function at: 0x%05x", addr);
	}

	spu_log.notice("Found %u SPU functions", obj.funcs.size());

	std::lock_guard lock(sections.mtx);

	for (const auto& known : sections.data)
	{
		// TODO: More robust duplicates filtering
		if (known.vaddr == vaddr && known.inst_data.starts_with(obj.inst_data))
		{
			spu_log.notice("Avoided duplicate SPU segment");
			return;
		}
	}

	sections.data.emplace_back(std::move(obj));
}
std::deque<spu_program> spu_cache::get()
{
std::deque<spu_program> result;
@ -618,6 +696,11 @@ void spu_cache::initialize()
atomic_t<usz> fnext{};
atomic_t<u8> fail_flag{0};
auto data_list = std::move(g_fxo->get<spu_section_data>().data);
g_fxo->get<spu_section_data>().had_been_used = true;
atomic_t<usz> data_indexer{};
if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11))
@ -657,7 +740,18 @@ void spu_cache::initialize()
thread_ctrl::wait_on(g_progr_ptotal, v);
}
g_progr_ptotal += ::size32(func_list);
u32 add_count = ::size32(func_list);
if (func_list.empty())
{
for (auto& sec : data_list)
{
add_count += sec.funcs.size();
}
}
g_progr_ptotal += add_count;
progr.emplace("Building SPU cache...");
worker_count = rpcs3::utils::get_max_threads();
@ -744,6 +838,7 @@ void spu_cache::initialize()
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
continue;
}
// Clear fake LS
@ -752,6 +847,107 @@ void spu_cache::initialize()
result++;
}
if (!func_list.empty() || !g_cfg.core.llvm_precompilation)
{
// Cache has already been initiated or the user does not want to precompile SPU programs
return result;
}
u32 last_sec_idx = umax;
for (usz func_i = data_indexer++;; func_i = data_indexer++, g_progr_pdone++)
{
u32 passed_count = 0;
u32 func_addr = 0;
u32 sec_addr = umax;
u32 sec_idx = 0;
std::basic_string_view<u32> inst_data;
// Try to get the data this index points to
for (auto& sec : data_list)
{
if (func_i < passed_count + sec.funcs.size())
{
sec_addr = sec.vaddr;
func_addr = ::at32(sec.funcs, func_i - passed_count);
inst_data = sec.inst_data;
break;
}
passed_count += sec.funcs.size();
sec_idx++;
}
if (sec_addr == umax)
{
// End of compilation for thread
break;
}
if (Emu.IsStopped() || fail_flag)
{
continue;
}
if (last_sec_idx != sec_idx)
{
if (last_sec_idx != umax)
{
// Clear fake LS of previous section
auto& sec = data_list[last_sec_idx];
std::memset(ls.data() + sec.vaddr / 4, 0, sec.inst_data.size() * 4);
}
// Initialize LS with the entire section data
for (u32 i = 0, pos = sec_addr; i < inst_data.size(); i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(inst_data[i]);
}
last_sec_idx = sec_idx;
}
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), func_addr);
while (!func2.data.empty())
{
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
const u32 prog_size = func2.data.size();
if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
break;
}
result++;
if (g_cfg.core.spu_block_size >= spu_block_size_type::mega)
{
// Should already take care of the entire function
break;
}
if (auto type = g_spu_itype.decode(last_inst);
type == spu_itype::BRSL || type == spu_itype::BRASL || type == spu_itype::BISL)
{
const u32 start_new = func_addr + prog_size * 4;
if (start_new < SPU_LS_SIZE && ls[start_new / 4] && g_spu_itype.decode(ls[start_new / 4]) != spu_itype::UNK)
{
spu_log.notice("Precompiling fallthrough to 0x%05x", start_new);
func2 = compiler->analyse(ls.data(), start_new);
func_addr = start_new;
continue;
}
}
break;
}
}
return result;
});
@ -1904,6 +2100,63 @@ void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/
}
}
std::vector<u32> spu_thread::discover_functions(const void* ls_start, u32 /*entry*/)
{
std::vector<u32> calls;
calls.reserve(100);
// Discover functions
// Use the most simple method: search for instructions that calls them
// And then filter invalid cases (does not detect tail calls)
for (u32 i = 0x10; i < SPU_LS_SIZE; i += 0x10)
{
// Search for BRSL and BRASL
// TODO: BISL
const v128 inst = read_from_ptr<be_t<v128>>(static_cast<const u8*>(ls_start), i);
const v128 shifted = gv_shr32(inst, 23);
const v128 eq_brsl = gv_eq32(shifted, v128::from32p(0x66));
const v128 eq_brasl = gv_eq32(shifted, v128::from32p(0x62));
const v128 result = eq_brsl | eq_brasl;
if (!gv_testz(result))
{
for (u32 j = 0; j < 4; j++)
{
if (result.u32r[j])
{
calls.push_back(i + j * 4);
}
}
}
}
calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller)
{
// Check the validity of both the callee code and the following caller code
return !is_exec_code(caller, ls_start) || !is_exec_code(caller + 4, ls_start);
}), calls.end());
std::vector<u32> addrs;
for (u32 addr : calls)
{
const spu_opcode_t op{read_from_ptr<be_t<u32>>(static_cast<const u8*>(ls_start), addr)};
const u32 func = op_branch_targets(addr, op)[0];
if (func == umax || std::count(addrs.begin(), addrs.end(), func))
{
continue;
}
addrs.push_back(func);
}
std::sort(addrs.begin(), addrs.end());
return addrs;
}
spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
{
// Result: addr + raw instruction data
@ -2647,6 +2900,8 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
}
}
spu_program result2 = result;
while (lsa > 0 || limit < 0x40000)
{
const u32 initial_size = ::size32(result.data);
@ -3093,7 +3348,13 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
{
workload.clear();
workload.push_back(entry_point);
ensure(m_bbs.count(entry_point));
if (!m_bbs.count(entry_point))
{
std::string func_bad;
dump(result2, func_bad);
spu_log.error("%s", func_bad);
return {};
}
std::basic_string<u32> new_entries;

View File

@ -487,7 +487,27 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
case spu_itype::BRASL:
{
const int index = (type == spu_itype::BR || type == spu_itype::BRA || type == spu_itype::BRSL || type == spu_itype::BRASL ? 0 : 1);
// if (type == spu_itype::BRASL || type == spu_itype::BRA)
// {
// res[index] = spu_branch_target(0, op.i16);
// }
// else
// {
// // Treat i16 as signed, this allows the caller to detect "overflows" and "underflows" in address in order to detect invalid branches
// // Example:
// // [0x3fffc] BR +4 -> BR 0 -> invalid
// // [0x3fffc] BR 0x3fff4 -> BR 0 -> invalid
// const u32 add = static_cast<s16>(op.si16);
// }
res[index] = (spu_branch_target(type == spu_itype::BRASL || type == spu_itype::BRA ? 0 : pc, op.i16));
if (res[0] == res[1])
{
res[1] = umax;
}
break;
}
case spu_itype::IRET:
@ -4013,7 +4033,7 @@ bool spu_thread::check_mfc_interrupts(u32 next_pc)
return false;
}
bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr)
bool spu_thread::is_exec_code(u32 addr, const void* ls_ptr)
{
if (addr & ~0x3FFFC)
{
@ -4022,8 +4042,8 @@ bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr)
for (u32 i = 0; i < 30; i++)
{
const u32 addr0 = addr + (i * 4);
const u32 op = read_from_ptr<be_t<u32>>(ls_ptr + addr0);
const u32 addr0 = spu_branch_target(addr);
const u32 op = read_from_ptr<be_t<u32>>(static_cast<const u8*>(ls_ptr) + addr0);
const auto type = s_spu_itype.decode(op);
if (type == spu_itype::UNK || !op)
@ -4033,9 +4053,38 @@ bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr)
if (type & spu_itype::branch)
{
// TODO
break;
const auto results = op_branch_targets(addr, spu_opcode_t{op});
if (results[0] == umax)
{
break;
}
for (usz res_i = 1; res_i < results.size(); res_i++)
{
	const u32 route_pc = results[res_i];

	if (route_pc >= SPU_LS_SIZE)
	{
		// No optional target in this slot
		continue;
	}

	// Test the validity of a single instruction of the optional target
	// This function can't be too slow and is unlikely to improve results by a great deal
	const u32 op0 = read_from_ptr<be_t<u32>>(static_cast<const u8*>(ls_ptr) + route_pc);
	const auto type0 = s_spu_itype.decode(op0);

	// Inspect the instruction at the target itself (op0/type0), not the branch instruction that led here
	if (type0 == spu_itype::UNK || !op0)
	{
		return false;
	}
}
addr = spu_branch_target(results[0]);
continue;
}
addr += 4;
}
return true;

View File

@ -825,7 +825,8 @@ public:
void set_events(u32 bits);
void set_interrupt_status(bool enable);
bool check_mfc_interrupts(u32 next_pc);
static bool is_exec_code(u32 addr, const u8* ls_ptr); // Only a hint, do not rely on it other than debugging purposes
static bool is_exec_code(u32 addr, const void* ls_ptr); // Only a hint, do not rely on it other than debugging purposes
static std::vector<u32> discover_functions(const void* ls_start, u32 /*entry*/);
u32 get_ch_count(u32 ch);
s64 get_ch_value(u32 ch);
bool set_ch_value(u32 ch, u32 value);

View File

@ -28,7 +28,7 @@ struct cfg_root : cfg::node
cfg::string llvm_cpu{ this, "Use LLVM CPU" };
cfg::_int<0, 1024> llvm_threads{ this, "Max LLVM Compile Threads", 0 };
cfg::_bool ppu_llvm_greedy_mode{ this, "PPU LLVM Greedy Mode", false, false };
cfg::_bool ppu_llvm_precompilation{ this, "PPU LLVM Precompilation", true };
cfg::_bool llvm_precompilation{ this, "LLVM Precompilation", true };
cfg::_enum<thread_scheduler_mode> thread_scheduler{this, "Thread Scheduler Mode", thread_scheduler_mode::os};
cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };

View File

@ -19,7 +19,7 @@ enum class emu_settings_type
SPUDebug,
MFCDebug,
MaxLLVMThreads,
PPULLVMPrecompilation,
LLVMPrecompilation,
EnableTSX,
AccurateGETLLAR,
AccurateSpuDMA,
@ -204,7 +204,7 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::SPUDebug, { "Core", "SPU Debug"}},
{ emu_settings_type::MFCDebug, { "Core", "MFC Debug"}},
{ emu_settings_type::MaxLLVMThreads, { "Core", "Max LLVM Compile Threads"}},
{ emu_settings_type::PPULLVMPrecompilation, { "Core", "PPU LLVM Precompilation"}},
{ emu_settings_type::LLVMPrecompilation, { "Core", "LLVM Precompilation"}},
{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},

View File

@ -1452,8 +1452,8 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->fixupPPUVNAN, emu_settings_type::FixupPPUVNAN);
SubscribeTooltip(ui->fixupPPUVNAN, tooltips.settings.fixup_ppuvnan);
m_emu_settings->EnhanceCheckBox(ui->ppuPrecompilation, emu_settings_type::PPULLVMPrecompilation);
SubscribeTooltip(ui->ppuPrecompilation, tooltips.settings.ppu_precompilation);
m_emu_settings->EnhanceCheckBox(ui->llvmPrecompilation, emu_settings_type::LLVMPrecompilation);
SubscribeTooltip(ui->llvmPrecompilation, tooltips.settings.llvm_precompilation);
m_emu_settings->EnhanceCheckBox(ui->suspendSavestates, emu_settings_type::SuspendEmulationSavestateMode);
SubscribeTooltip(ui->suspendSavestates, tooltips.settings.suspend_savestates);

View File

@ -2394,9 +2394,9 @@
</widget>
</item>
<item>
<widget class="QCheckBox" name="ppuPrecompilation">
<widget class="QCheckBox" name="llvmPrecompilation">
<property name="text">
<string>PPU LLVM Precompilation</string>
<string>PPU/SPU LLVM Precompilation</string>
</property>
</widget>
</item>

View File

@ -75,7 +75,7 @@ public:
const QString ppu__static = tr("Interpreter (slow). Try this if PPU Recompiler (LLVM) doesn't work.");
const QString ppu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if PPU Recompiler (LLVM) doesn't work.");
const QString ppu_llvm = tr("Recompiles and caches the game's PPU code using the LLVM Recompiler once before running it for the first time.\nThis is by far the fastest option and should always be used.\nShould you face compatibility issues, fall back to one of the Interpreters and retry.\nIf unsure, use this option.");
const QString ppu_precompilation = tr("Searches the game's directory and precompiles extra PPU modules during boot.\nIf disabled, these modules will only be compiled when needed. Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly.");
const QString llvm_precompilation = tr("Searches the game's directory and precompiles extra PPU and SPU modules during boot.\nIf disabled, these modules will only be compiled when needed. Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly.");
const QString spu__static = tr("Interpreter (slow). Try this if SPU Recompiler (LLVM) doesn't work.");
const QString spu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if SPU Recompiler (LLVM) doesn't work.");
const QString spu_asmjit = tr("Recompiles the game's SPU code using the ASMJIT Recompiler.\nThis is the fast option with very good compatibility.\nIf unsure, use this option.");