From b4fc43d78708e65b56a15038748d318b21faaf6e Mon Sep 17 00:00:00 2001
From: Eladash
Date: Sat, 9 Sep 2023 13:28:33 +0300
Subject: [PATCH] PPU LLVM: Re-add multi-threaded overlay module compilation

---
 Utilities/Thread.h               |  60 ++++++++++--
 rpcs3/Emu/Cell/PPUAnalyser.h     |   2 +-
 rpcs3/Emu/Cell/PPUModule.cpp     |  16 +++-
 rpcs3/Emu/Cell/PPUThread.cpp     | 160 ++++++++++++++++++++-----------
 rpcs3/Emu/Cell/lv2/sys_overlay.h |   2 +
 rpcs3/Emu/System.cpp             |   2 +-
 6 files changed, 172 insertions(+), 70 deletions(-)

diff --git a/Utilities/Thread.h b/Utilities/Thread.h
index f85f94194e..e8fba73380 100644
--- a/Utilities/Thread.h
+++ b/Utilities/Thread.h
@@ -694,7 +694,7 @@ class named_thread_group final
 {
 	using Thread = named_thread<Context>;
 
-	const u32 m_count;
+	u32 m_count = 0;
 
 	Thread* m_threads;
 
@@ -705,7 +705,7 @@ class named_thread_group final
 
 public:
 	// Lambda constructor, also the implicit deduction guide candidate
-	named_thread_group(std::string_view name, u32 count, const Context& f)
+	named_thread_group(std::string_view name, u32 count, Context&& f) noexcept
 		: m_count(count)
 		, m_threads(nullptr)
 	{
@@ -717,14 +717,60 @@ public:
 		init_threads();
 
 		// Create all threads
-		for (u32 i = 0; i < m_count; i++)
+		for (u32 i = 0; i < m_count - 1; i++)
 		{
-			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), f);
+			// Copy the context
+			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), static_cast<const Context&>(f));
 		}
+
+		// Move the context (if movable)
+		new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::forward<Context>(f));
+	}
+
+	// Constructor with a function performed before adding more threads
+	template <typename CheckAndPrepare>
+	named_thread_group(std::string_view name, u32 count, Context&& f, CheckAndPrepare&& check) noexcept
+		: m_count(count)
+		, m_threads(nullptr)
+	{
+		if (count == 0)
+		{
+			return;
+		}
+
+		init_threads();
+		m_count = 0;
+
+		// Create all threads
+		for (u32 i = 0; i < count - 1; i++)
+		{
+			// Copy the context
+			std::remove_cvref_t<Context> context(static_cast<const Context&>(f));
+
+			// Perform the check and additional preparations for each context
+			if (!std::invoke(std::forward<CheckAndPrepare>(check), i, context))
+			{
+				return;
+			}
+
+			m_count++;
+			new (static_cast<void*>(m_threads + i)) Thread(std::string(name) + std::to_string(i + 1), std::move(context));
+		}
+
+		// Move the context (if movable)
+		std::remove_cvref_t<Context> context(std::forward<Context>(f));
+
+		if (!std::invoke(std::forward<CheckAndPrepare>(check), m_count - 1, context))
+		{
+			return;
+		}
+
+		m_count++;
+		new (static_cast<void*>(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::move(context));
 	}
 
 	// Default constructor
-	named_thread_group(std::string_view name, u32 count)
+	named_thread_group(std::string_view name, u32 count) noexcept
 		: m_count(count)
 		, m_threads(nullptr)
 	{
@@ -791,10 +837,10 @@ public:
 		return m_count;
 	}
 
-	~named_thread_group()
+	~named_thread_group() noexcept
 	{
 		// Destroy all threads (it should join them)
-		for (u32 i = 0; i < m_count; i++)
+		for (u32 i = m_count - 1; i < m_count; i--)
 		{
 			std::launder(m_threads + i)->~Thread();
 		}
diff --git a/rpcs3/Emu/Cell/PPUAnalyser.h b/rpcs3/Emu/Cell/PPUAnalyser.h
index 492e3c012b..c1515c65c1 100644
--- a/rpcs3/Emu/Cell/PPUAnalyser.h
+++ b/rpcs3/Emu/Cell/PPUAnalyser.h
@@ -189,7 +189,7 @@ struct main_ppu_module : public ppu_module
 {
 	u32 elf_entry{};
 	u32 seg0_code_end{};
-	std::basic_string<u32> applied_pathes;
+	std::basic_string<u32> applied_patches;
 };
 
 // Aux
diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp
index bfd25b2297..0de8e077b7 100644
--- a/rpcs3/Emu/Cell/PPUModule.cpp
+++ b/rpcs3/Emu/Cell/PPUModule.cpp
@@ -2460,7 +2460,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str
 
 	_main.elf_entry = static_cast<u32>(elf.header.e_entry);
 	_main.seg0_code_end = end;
-	_main.applied_pathes = applied;
+	_main.applied_patches = applied;
 
 	if (!virtual_load)
 	{
@@ -2987,13 +2987,23 @@ std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_ex
 	}
 
 	ovlm->entry = static_cast<u32>(elf.header.e_entry);
+	ovlm->seg0_code_end = end;
+	ovlm->applied_patches = std::move(applied);
+
+	const bool is_being_used_in_emulation = (vm::base(ovlm->segs[0].addr) == ovlm->segs[0].ptr);
+
+	if (!is_being_used_in_emulation)
+	{
+		// Postpone to later
+		return {std::move(ovlm), {}};
+	}
 
 	const auto cpu = cpu_thread::get_current();
 
 	// Analyse executable (TODO)
-	if (!ovlm->analyse(0, ovlm->entry, end, applied, !cpu ? std::function<bool()>() : [cpu, is_being_used_in_emulation = (vm::base(ovlm->segs[0].addr) == ovlm->segs[0].ptr)]()
+	if (!ovlm->analyse(0, ovlm->entry, end, ovlm->applied_patches, !cpu ? std::function<bool()>() : [cpu]()
 	{
-		return is_being_used_in_emulation && cpu->state & cpu_flag::exit;
+		return !!(cpu->state & cpu_flag::exit);
 	}))
 	{
 		return {nullptr, CellError{CELL_CANCEL + 0u}};
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 1254c39000..b8808dfde4 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -3418,6 +3418,19 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 	return ppu_store_reservation(ppu, addr, reg_value);
 }
 
+struct jit_core_allocator
+{
+	const s32 thread_count = g_cfg.core.llvm_threads ? std::min<s32>(g_cfg.core.llvm_threads, limit()) : limit();
+
+	// Initialize global semaphore with the max number of threads
+	::semaphore<0x7fffffff> sem{std::max(thread_count, 1)};
+
+	static s32 limit()
+	{
+		return static_cast<s32>(utils::get_thread_count());
+	}
+};
+
 #ifdef LLVM_AVAILABLE
 namespace
 {
@@ -3771,7 +3784,6 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
 	atomic_t<usz> fnext = 0;
 
 	lf_queue<std::string> possible_exec_file_paths;
-	shared_mutex ovl_mtx;
 
 	named_thread_group workers("SPRX Worker ", std::min(utils::get_thread_count(), ::size32(file_queue)), [&]
 	{
@@ -3854,15 +3866,18 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
+				if (std::lock_guard lock(g_fxo->get<jit_core_allocator>().sem); !ovlm->analyse(0, ovlm->entry, ovlm->seg0_code_end, ovlm->applied_patches, []()
 				{
-					// Does not really require this lock, this is done for performance reasons.
-					// Seems like too many created threads is hard for Windows to manage efficiently with many CPU threads.
-					std::lock_guard lock(ovl_mtx);
-					ppu_initialize(*ovlm, false, file_size);
+					return Emu.IsStopped();
+				}))
+				{
+					// Emulation stopped
+					break;
 				}
 
+				obj.clear(), src.close(); // Clear decrypted file and elf object memory
+				ppu_initialize(*ovlm, false, file_size);
 				ppu_finalize(*ovlm);
 				break;
 			}
@@ -3910,7 +3925,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu
 			g_fxo->get<spu_cache>() = std::move(current_cache);
 			break;
@@ -4004,7 +4019,7 @@ extern void ppu_initialize()
 	scoped_progress_dialog progr = "Analyzing PPU Executable...";
 
 	// Analyse executable
-	if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_pathes, [](){ return Emu.IsStopped(); }))
+	if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_patches, [](){ return Emu.IsStopped(); }))
 	{
 		return;
 	}
@@ -4238,19 +4253,6 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		progr.emplace("Loading PPU modules...");
 	}
 
-	struct jit_core_allocator
-	{
-		const s32 thread_count = g_cfg.core.llvm_threads ? std::min<s32>(g_cfg.core.llvm_threads, limit()) : limit();
-
-		// Initialize global semaphore with the max number of threads
-		::semaphore<0x7fffffff> sem{std::max(thread_count, 1)};
-
-		static s32 limit()
-		{
-			return static_cast<s32>(utils::get_thread_count());
-		}
-	};
-
 	// Permanently loaded compiled PPU modules (name -> data)
 	jit_module& jit_mod = g_fxo->get<jit_module_manager>().get(cache_path + "_" + std::to_string(std::bit_cast<uptr>(info.segs[0].ptr)));
 
@@ -4606,13 +4608,11 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 		g_progr_fknown_bits += file_size;
 	}
 
+	// Create worker threads for compilation
 	if (!workload.empty())
 	{
 		*progr = "Compiling PPU modules...";
-	}
-
-	// Create worker threads for compilation (TODO: how many threads)
-	{
 		u32 thread_count = rpcs3::utils::get_max_threads();
 
 		if (workload.size() < thread_count)
 		{
@@ -4625,49 +4625,93 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size)
 			atomic_t index = 0;
 		};
 
+		struct thread_op
+		{
+			atomic_t<u32>& work_cv;
+			std::vector<std::pair<std::string, ppu_module>>& workload;
+			const std::string& cache_path;
+			const cpu_thread* cpu;
+
+			std::unique_lock<decltype(jit_core_allocator::sem)> core_lock;
+
+			thread_op(atomic_t<u32>& work_cv, std::vector<std::pair<std::string, ppu_module>>& workload
+				, const cpu_thread* cpu, const std::string& cache_path, decltype(jit_core_allocator::sem)& sem) noexcept
+
+				: work_cv(work_cv)
+				, workload(workload)
+				, cache_path(cache_path)
+				, cpu(cpu)
+			{
+				// Save mutex
+				core_lock = std::unique_lock{sem, std::defer_lock};
+			}
+
+			thread_op(const thread_op& other) noexcept
+				: work_cv(other.work_cv)
+				, workload(other.workload)
+				, cache_path(other.cache_path)
+				, cpu(other.cpu)
+			{
+				if (auto mtx = other.core_lock.mutex())
+				{
+					// Save mutex
+					core_lock = std::unique_lock{*mtx, std::defer_lock};
+				}
+			}
+
+			thread_op(thread_op&& other) noexcept = default;
+
+			void operator()()
+			{
+				// Set low priority
+				thread_ctrl::scoped_priority low_prio(-1);
+
+	#ifdef __APPLE__
+				pthread_jit_write_protect_np(false);
+	#endif
+				for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
+				{
+					if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
+					{
+						continue;
+					}
+
+					// Keep allocating workload
+					const auto& [obj_name, part] = std::as_const(workload)[i];
+
+					ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
+
+					// Use another JIT instance
+					jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
+					ppu_initialize2(jit2, part, cache_path, obj_name);
+
+					ppu_log.success("LLVM: Compiled module %s", obj_name);
+				}
+
+				core_lock.unlock();
+			}
+		};
+
 		// Prevent watchdog thread from terminating
 		g_watchdog_hold_ctr++;
 
-		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get().index), thread_count, [&]()
+		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get().index), thread_count
+			, thread_op(work_cv, workload, cpu, cache_path, g_fxo->get<jit_core_allocator>().sem)
+			, [&](u32 /*thread_index*/, thread_op& op)
 		{
-			// Set low priority
-			thread_ctrl::scoped_priority low_prio(-1);
+			// Allocate "core"
+			op.core_lock.lock();
 
-#ifdef __APPLE__
-			pthread_jit_write_protect_np(false);
-#endif
-			for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
-			{
-				if (Emu.IsStopped())
-				{
-					continue;
-				}
-
-				// Keep allocating workload
-				const auto& [obj_name, part] = std::as_const(workload)[i];
-
-				// Allocate "core"
-				std::lock_guard jlock(g_fxo->get<jit_core_allocator>().sem);
-
-				if (Emu.IsStopped())
-				{
-					continue;
-				}
-
-				ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);
-
-				// Use another JIT instance
-				jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
-				ppu_initialize2(jit2, part, cache_path, obj_name);
-
-				ppu_log.success("LLVM: Compiled module %s", obj_name);
-			}
+			// Second check before creating another thread
+			return work_cv < workload.size() && (cpu ? !cpu->state.all_of(cpu_flag::exit) : !Emu.IsStopped());
 		});
 
 		threads.join();
 		g_watchdog_hold_ctr--;
+	}
 
+	{
 		if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()))
 		{
 			return compiled_new;
diff --git a/rpcs3/Emu/Cell/lv2/sys_overlay.h b/rpcs3/Emu/Cell/lv2/sys_overlay.h
index 3d12d90568..103321b9e0 100644
--- a/rpcs3/Emu/Cell/lv2/sys_overlay.h
+++ b/rpcs3/Emu/Cell/lv2/sys_overlay.h
@@ -8,6 +8,8 @@ struct lv2_overlay final : lv2_obj, ppu_module
 	static const u32 id_base = 0x25000000;
 
 	u32 entry;
+	u32 seg0_code_end{};
+	std::basic_string<u32> applied_patches;
 
 	lv2_overlay() = default;
 	lv2_overlay(utils::serial&){}
diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp
index 07cdfa5ebe..a3aad6220f 100644
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@@ -1480,7 +1480,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch,
 	{
 		if (auto& _main = *ensure(g_fxo->try_get<main_ppu_module>()); !_main.path.empty())
 		{
-			if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_pathes, [](){ return Emu.IsStopped(); }))
+			if (!_main.analyse(0, _main.elf_entry, _main.seg0_code_end, _main.applied_patches, [](){ return Emu.IsStopped(); }))
 			{
 				return;
 			}
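
Usage sketch (illustration only, not part of the patch): the new named_thread_group overload added in Utilities/Thread.h takes a callable context plus a per-thread "check and prepare" callback. The callback receives the thread index and a mutable copy of the context before each thread is spawned, and returning false stops creating further threads; PPUThread.cpp drives it this way with thread_op and the jit_core_allocator semaphore. The snippet below assumes the patched Utilities/Thread.h (named_thread_group, u32) is available; compile_job, run_workers and the printf payload are made-up stand-ins, not RPCS3 code.

	#include <atomic>
	#include <cstdio>

	struct compile_job // hypothetical stand-in for thread_op
	{
		std::atomic<unsigned>* next; // shared work index
		unsigned total;

		void operator()() // body executed by each created thread
		{
			for (unsigned i = (*next)++; i < total; i = (*next)++)
				std::printf("compiling item %u\n", i);
		}
	};

	void run_workers(unsigned total)
	{
		std::atomic<unsigned> next{0};

		named_thread_group threads("Worker.", 4, compile_job{&next, total},
			[&](u32 /*index*/, compile_job&)
			{
				// Invoked before each thread is created; returning false
				// stops spawning more threads (e.g. nothing left to do).
				return next < total;
			});

		threads.join(); // the destructor would also join the threads
	}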