From b4fc43d78708e65b56a15038748d318b21faaf6e Mon Sep 17 00:00:00 2001
From: Eladash
Date: Sat, 9 Sep 2023 13:28:33 +0300
Subject: [PATCH] PPU LLVM: Re-add multi-threaded overlay module compilation

---
 Utilities/Thread.h               |  60 ++++++++++--
 rpcs3/Emu/Cell/PPUAnalyser.h     |   2 +-
 rpcs3/Emu/Cell/PPUModule.cpp     |  16 +++-
 rpcs3/Emu/Cell/PPUThread.cpp     | 160 ++++++++++++++++++++-----------
 rpcs3/Emu/Cell/lv2/sys_overlay.h |   2 +
 rpcs3/Emu/System.cpp             |   2 +-
 6 files changed, 172 insertions(+), 70 deletions(-)

diff --git a/Utilities/Thread.h b/Utilities/Thread.h
index f85f94194e..e8fba73380 100644
--- a/Utilities/Thread.h
+++ b/Utilities/Thread.h
@@ -694,7 +694,7 @@ class named_thread_group final
 {
 	using Thread = named_thread<Context>;
 
-	const u32 m_count;
+	u32 m_count = 0;
 
 	Thread* m_threads;
 
@@ -705,7 +705,7 @@ class named_thread_group final
 
 public:
 	// Lambda constructor, also the implicit deduction guide candidate
-	named_thread_group(std::string_view name, u32 count, const Context& f)
+	named_thread_group(std::string_view name, u32 count, Context&& f) noexcept
 		: m_count(count)
 		, m_threads(nullptr)
 	{
@@ -717,14 +717,60 @@ public:
 		init_threads();
 
 		// Create all threads
-		for (u32 i = 0; i < m_count; i++)
+		for (u32 i = 0; i < m_count - 1; i++)
 		{
-			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), f);
+			// Copy the context
+			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), static_cast<const Context&>(f));
 		}
+
+		// Move the context (if movable)
+		new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::forward<Context>(f));
+	}
+
+	// Constructor with a function performed before adding more threads
+	template <typename CheckAndPrepare>
+	named_thread_group(std::string_view name, u32 count, Context&& f, CheckAndPrepare&& check) noexcept
+		: m_count(count)
+		, m_threads(nullptr)
+	{
+		if (count == 0)
+		{
+			return;
+		}
+
+		init_threads();
+		m_count = 0;
+
+		// Create all threads
+		for (u32 i = 0; i < count - 1; i++)
+		{
+			// Copy the context
+			std::remove_cvref_t<Context> context(static_cast<const Context&>(f));
+
+			// Perform the check and additional preparations for each context
+			if (!std::invoke(std::forward<CheckAndPrepare>(check), i, context))
+			{
+				return;
+			}
+
+			m_count++;
+			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), std::move(context));
+		}
+
+		// Move the context (if movable)
+		std::remove_cvref_t<Context> context(std::forward<Context>(f));
+
+		if (!std::invoke(std::forward<CheckAndPrepare>(check), m_count - 1, context))
+		{
+			return;
+		}
+
+		m_count++;
+		new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::move(context));
 	}
 
 	// Default constructor
-	named_thread_group(std::string_view name, u32 count)
+	named_thread_group(std::string_view name, u32 count) noexcept
 		: m_count(count)
 		, m_threads(nullptr)
 	{
@@ -791,10 +837,10 @@ public:
 		return m_count;
 	}
 
-	~named_thread_group()
+	~named_thread_group() noexcept
 	{
 		// Destroy all threads (it should join them)
-		for (u32 i = 0; i < m_count; i++)
+		for (u32 i = m_count - 1; i < m_count; i--)
 		{
 			std::launder(m_threads + i)->~Thread();
 		}
diff --git a/rpcs3/Emu/Cell/PPUAnalyser.h b/rpcs3/Emu/Cell/PPUAnalyser.h
index 492e3c012b..c1515c65c1 100644
--- a/rpcs3/Emu/Cell/PPUAnalyser.h
+++ b/rpcs3/Emu/Cell/PPUAnalyser.h
@@ -189,7 +189,7 @@ struct main_ppu_module : public ppu_module
 {
 	u32 elf_entry{};
 	u32 seg0_code_end{};
-	std::basic_string<u32> applied_pathes;
+	std::basic_string<u32> applied_patches;
 };
 
 // Aux
diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp
index bfd25b2297..0de8e077b7 100644
--- a/rpcs3/Emu/Cell/PPUModule.cpp
+++ b/rpcs3/Emu/Cell/PPUModule.cpp
@@ -2460,7 +2460,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
 
 	_main.elf_entry = static_cast<u32>(elf.header.e_entry);
 	_main.seg0_code_end = end;
-	_main.applied_pathes = applied;
+	_main.applied_patches = applied;
 
 	if (!virtual_load)
 	{
@@ -2987,13 +2987,23 @@ std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_ex
 	}
 
 	ovlm->entry = static_cast<u32>(elf.header.e_entry);
+	ovlm->seg0_code_end = end;
+	ovlm->applied_patches = std::move(applied);
+
+	const bool is_being_used_in_emulation = (vm::base(ovlm->segs[0].addr) == ovlm->segs[0].ptr);
+
+	if (!is_being_used_in_emulation)
+	{
+		// Postpone to later
+		return {std::move(ovlm), {}};
+	}
 
 	const auto cpu = cpu_thread::get_current();
 
 	// Analyse executable (TODO)
-	if (!ovlm->analyse(0, ovlm->entry, end, applied, !cpu ? std::function<bool()>() : [cpu, is_being_used_in_emulation = (vm::base(ovlm->segs[0].addr) == ovlm->segs[0].ptr)]()
+	if (!ovlm->analyse(0, ovlm->entry, end, ovlm->applied_patches, !cpu ? std::function<bool()>() : [cpu]()
 	{
-		return is_being_used_in_emulation && cpu->state & cpu_flag::exit;
+		return !!(cpu->state & cpu_flag::exit);
 	}))
 	{
 		return {nullptr, CellError{CELL_CANCEL + 0u}};
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 1254c39000..b8808dfde4 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -3418,6 +3418,19 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 	return ppu_store_reservation(ppu, addr, reg_value);
 }
 
+struct jit_core_allocator
+{
+	const s32 thread_count = g_cfg.core.llvm_threads ? std::min<s32>(g_cfg.core.llvm_threads, limit()) : limit();
+
+	// Initialize global semaphore with the max number of threads
+	::semaphore<0x7fffffff> sem{std::max(thread_count, 1)};
+
+	static s32 limit()
+	{
+		return static_cast<s32>(utils::get_thread_count());
+	}
+};
+
 #ifdef LLVM_AVAILABLE
 namespace
 {
@@ -3771,7 +3784,6 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
 	atomic_t<usz> fnext = 0;
 
 	lf_queue<std::string> possible_exec_file_paths;
-	shared_mutex ovl_mtx;
 
 	named_thread_group workers("SPRX Worker ", std::min(utils::get_thread_count(), ::size32(file_queue)), [&]
 	{
@@ -3854,15 +3866,18 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
+				if (std::lock_guard lock(g_fxo->get<jit_core_allocator>().sem); !ovlm->analyse(0, ovlm->entry, ovlm->seg0_code_end, ovlm->applied_patches, []()
 				{
-					// Does not really require this lock, this is done for performance reasons.
-					// Seems like too many created threads is hard for Windows to manage efficiently with many CPU threads.
-					std::lock_guard lock(ovl_mtx);
-					ppu_initialize(*ovlm, false, file_size);
+					return Emu.IsStopped();
+				}))
+				{
+					// Emulation stopped
+					break;
 				}
 
+				obj.clear(), src.close(); // Clear decrypted file and elf object memory
+				ppu_initialize(*ovlm, false, file_size);
 				ppu_finalize(*ovlm);
 				break;
 			}
@@ -3910,7 +3925,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
 			g_fxo->get<spu_cache>() = std::move(current_cache);
 			break;
@@ -4004,7 +4019,7 @@ extern void ppu_initialize()
 	scoped_progress_dialog progr = "Analyzing PPU Executable...";
 
 	// Analyse executable
-	if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_pathes, [](){ return Emu.IsStopped(); }))
+	if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_patches, [](){ return Emu.IsStopped(); }))
 	{
 		return;
 	}
@@ -4238,19 +4253,6 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		progr.emplace("Loading PPU modules...");
 	}
 
-	struct jit_core_allocator
-	{
-		const s32 thread_count = g_cfg.core.llvm_threads ? std::min<s32>(g_cfg.core.llvm_threads, limit()) : limit();
-
-		// Initialize global semaphore with the max number of threads
-		::semaphore<0x7fffffff> sem{std::max(thread_count, 1)};
-
-		static s32 limit()
-		{
-			return static_cast<s32>(utils::get_thread_count());
-		}
-	};
-
 	// Permanently loaded compiled PPU modules (name -> data)
 	jit_module& jit_mod = g_fxo->get<jit_module_manager>().get(cache_path + "_" + std::to_string(std::bit_cast<uptr>(info.segs[0].ptr)));
 
@@ -4606,13 +4608,11 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		g_progr_fknown_bits += file_size;
 	}
 
+	// Create worker threads for compilation
 	if (!workload.empty())
 	{
 		*progr = "Compiling PPU modules...";
-	}
-
-	// Create worker threads for compilation (TODO: how many threads)
-	{
 		u32 thread_count = rpcs3::utils::get_max_threads();
 
 		if (workload.size() < thread_count)
 		{
@@ -4625,49 +4625,93 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 			atomic_t index = 0;
 		};
 
+		struct thread_op
+		{
+			atomic_t<u32>& work_cv;
+			std::vector<std::pair<std::string, ppu_module>>& workload;
+			const std::string& cache_path;
+			const cpu_thread* cpu;
+
+			std::unique_lock<decltype(jit_core_allocator::sem)> core_lock;
+
+			thread_op(atomic_t<u32>& work_cv, std::vector<std::pair<std::string, ppu_module>>& workload
+				, const cpu_thread* cpu, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept
+
+				: work_cv(work_cv)
+				, workload(workload)
+				, cache_path(cache_path)
+				, cpu(cpu)
+			{
+				// Save mutex
+				core_lock = std::unique_lock{sem, std::defer_lock};
+			}
+
+			thread_op(const thread_op& other) noexcept
+				: work_cv(other.work_cv)
+				, workload(other.workload)
+				, cache_path(other.cache_path)
+				, cpu(other.cpu)
+			{
+				if (auto mtx = other.core_lock.mutex())
+				{
+					// Save mutex
+					core_lock = std::unique_lock{*mtx, std::defer_lock};
+				}
+			}
+
+			thread_op(thread_op&& other) noexcept = default;
+
+			void operator()()
+			{
+				// Set low priority
+				thread_ctrl::scoped_priority low_prio(-1);
+
+	#ifdef __APPLE__
+				pthread_jit_write_protect_np(false);
+	#endif
+				for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
+				{
+					if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
+					{
+						continue;
+					}
+
+					// Keep allocating workload
+					const auto& [obj_name, part] = std::as_const(workload)[i];
+
+					ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
+
+					// Use another JIT instance
+					jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
+					ppu_initialize2(jit2, part, cache_path, obj_name);
+
+					ppu_log.success("LLVM: Compiled module %s", obj_name);
+				}
+
+				core_lock.unlock();
+			}
+		};
+
 		// Prevent watchdog thread from terminating
 		g_watchdog_hold_ctr++;
 
-		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get().index), thread_count, [&]()
+		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get().index), thread_count
+			, thread_op(work_cv, workload, cpu, cache_path, g_fxo->get<jit_core_allocator>().sem)
+			, [&](u32 /*thread_index*/, thread_op& op)
 		{
-			// Set low priority
-			thread_ctrl::scoped_priority low_prio(-1);
+			// Allocate "core"
+			op.core_lock.lock();
 
-#ifdef __APPLE__
-			pthread_jit_write_protect_np(false);
-#endif
-			for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
-			{
-				if (Emu.IsStopped())
-				{
-					continue;
-				}
-
-				// Keep allocating workload
-				const auto& [obj_name, part] = std::as_const(workload)[i];
-
-				// Allocate "core"
-				std::lock_guard jlock(g_fxo->get<jit_core_allocator>().sem);
-
-				if (Emu.IsStopped())
-				{
-					continue;
-				}
-
-				ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
-
-				// Use another JIT instance
-				jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
-				ppu_initialize2(jit2, part, cache_path, obj_name);
-
-				ppu_log.success("LLVM: Compiled module %s", obj_name);
-			}
+			// Second check before creating another thread
+			return work_cv < workload.size() && (cpu ? !cpu->state.all_of(cpu_flag::exit) : !Emu.IsStopped());
 		});
 
 		threads.join();
 		g_watchdog_hold_ctr--;
+	}
 
+	{
 		if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()))
 		{
 			return compiled_new;
diff --git a/rpcs3/Emu/Cell/lv2/sys_overlay.h b/rpcs3/Emu/Cell/lv2/sys_overlay.h
index 3d12d90568..103321b9e0 100644
--- a/rpcs3/Emu/Cell/lv2/sys_overlay.h
+++ b/rpcs3/Emu/Cell/lv2/sys_overlay.h
@@ -8,6 +8,8 @@ struct lv2_overlay final : lv2_obj, ppu_module
 	static const u32 id_base = 0x25000000;
 
 	u32 entry;
+	u32 seg0_code_end{};
+	std::basic_string<u32> applied_patches;
 
 	lv2_overlay() = default;
 	lv2_overlay(utils::serial&){}
diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp
index 07cdfa5ebe..a3aad6220f 100644
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@@ -1480,7 +1480,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch,
 	{
 		if (auto& _main = *ensure(g_fxo->try_get<main_ppu_module>()); !_main.path.empty())
 		{
-			if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_pathes, [](){ return Emu.IsStopped(); }))
+			if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_patches, [](){ return Emu.IsStopped(); }))
 			{
 				return;
 			}
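
Usage sketch (illustration only, not part of the patch): the new named_thread_group overload added in Utilities/Thread.h takes a callable context plus a per-thread "check and prepare" callback. The callback receives the thread index and a mutable copy of the context before each thread is spawned, and returning false stops creating further threads; PPUThread.cpp drives it this way with thread_op and the jit_core_allocator semaphore. The snippet below assumes the patched Utilities/Thread.h (named_thread_group, u32) is available; compile_job, run_workers and the printf payload are made-up stand-ins, not RPCS3 code.

	#include <atomic>
	#include <cstdio>

	struct compile_job // hypothetical stand-in for thread_op
	{
		std::atomic<unsigned>* next; // shared work index
		unsigned total;

		void operator()() // body executed by each created thread
		{
			for (unsigned i = (*next)++; i < total; i = (*next)++)
				std::printf("compiling item %u\n", i);
		}
	};

	void run_workers(unsigned total)
	{
		std::atomic<unsigned> next{0};

		named_thread_group threads("Worker.", 4, compile_job{&next, total},
			[&](u32 /*index*/, compile_job&)
			{
				// Invoked before each thread is created; returning false
				// stops spawning more threads (e.g. nothing left to do).
				return next < total;
			});

		threads.join(); // the destructor would also join the threads
	}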