LV2: Optimizations and fixes

Fix and optimize sys_ppu_thread_yield

Fix a timeout bug in LV2 syscalls (use ppu_thread::cancel_sleep instead)

Move timeout notification out of mutex scope

Allow g_waiting timeouts to be awoken in scope
Elad Ashkenazi 2022-08-08 08:33:49 +03:00 committed by Ivan
parent cba4c3cdc4
commit c4cc0154be
10 changed files with 260 additions and 202 deletions
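
The cancel_sleep handshake mentioned above works roughly like this: the syscall arms the flag before it commits to sleeping, the scheduler promotes it when the thread wins arbitration anyway, and the sleep call consumes it. A minimal sketch of that three-value protocol (illustrative types only, not RPCS3's ppu_thread; in the real code the transitions happen under the lv2_obj scheduler mutex):

#include <utility>

// Sketch: 0 = idle, 1 = armed by the syscall, 2 = scheduler cancelled the pending sleep.
struct thread_stub { int cancel_sleep = 0; };

// Syscall path, before trying to own the lock/object.
void arm(thread_stub& t) { t.cancel_sleep = 1; }

// Scheduler path (awake_unlocked): the thread ended up first in the run queue,
// so the sleep it was about to perform is no longer needed.
void on_scheduled_first(thread_stub& t)
{
    if (t.cancel_sleep == 1)
        t.cancel_sleep = 2;
}

// Sleep path (sleep_unlocked): consume the flag. A false return means "sleep cancelled,
// the operation already completed", so the caller skips the wait entirely.
bool commit_sleep(thread_stub& t)
{
    return std::exchange(t.cancel_sleep, 0) != 2;
}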

View File

@ -700,6 +700,12 @@ bool cpu_thread::check_state() noexcept
store = true;
}
if (flags & cpu_flag::notify)
{
flags -= cpu_flag::notify;
store = true;
}
// Can't process dbg_step if we only paused temporarily
if (cpu_can_stop && flags & cpu_flag::dbg_step)
{
@ -779,6 +785,8 @@ bool cpu_thread::check_state() noexcept
if ((state1 ^ state) - pending_and_temp)
{
// Work could have changed flags
// Reset internal flags as if check_state() has just been called
cpu_sleep_called = false;
continue;
}
}

View File

@ -24,6 +24,7 @@ enum class cpu_flag : u32
memory, // Thread must unlock memory mutex
pending, // Thread has postponed work
pending_recheck, // Thread needs to recheck if there is pending work before ::pending removal
notify, // Flag meant solely to allow atomic notification on state without changing other flags
dbg_global_pause, // Emulation paused
dbg_pause, // Thread paused
@ -174,7 +175,7 @@ public:
virtual void cpu_sleep() {}
// Callback for cpu_flag::pending
virtual void cpu_work() {}
virtual void cpu_work() { state -= cpu_flag::pending + cpu_flag::pending_recheck; }
// Callback for cpu_flag::ret
virtual void cpu_return() {}
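
cpu_flag::notify gives schedule_all() a way to poke a sleeping thread purely through its state atomic: the flag is set on a thread whose registered timeout expired, wait_timeout() treats it as a forced timeout, and check_state() clears it without disturbing other flags. A minimal sketch of that producer/consumer pattern using a plain C++20 std::atomic (illustrative only; the real code uses RPCS3's atomic_bs_t and thread_ctrl::wait_on):

#include <atomic>
#include <cstdint>

constexpr std::uint32_t flag_notify = 1u << 0;

// Waker (schedule_all): set the dedicated bit, then wake the waiter on the same atomic.
void force_timeout(std::atomic<std::uint32_t>& state)
{
    state.fetch_or(flag_notify, std::memory_order_release);
    state.notify_one();
}

// Waiter (wait_timeout): sleep on the atomic until the value changes, then check the bit.
void wait_until_notified(std::atomic<std::uint32_t>& state)
{
    for (std::uint32_t old = state.load(); !(old & flag_notify); old = state.load())
        state.wait(old); // returns when state no longer equals old

    // Consumer (check_state): clear the bit so later sleeps are not spuriously cut short.
    state.fetch_and(~flag_notify);
}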

View File

@ -270,6 +270,8 @@ public:
alignas(64) const ppu_func_opd_t entry_func;
u64 start_time{0}; // Sleep start timepoint
u64 end_time{umax}; // Sleep end timepoint
s32 cancel_sleep{0}; // Flag to cancel the next lv2_obj::sleep call (when equals 2)
u64 syscall_args[8]{0}; // Last syscall arguments stored
const char* current_function{}; // Current function name for diagnosis, optimized for speed.
const char* last_function{}; // Sticky copy of current_function, is not cleared on function return
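
end_time holds the guest-time deadline that the reworked wait_timeout() below recomputes its remaining wait from; it is produced with an overflow guard so that start_time + timeout can never wrap. A standalone restatement of that expression (same formula as in sleep_unlocked further down, shown here only for clarity):

#include <algorithm>
#include <cstdint>

// ~start_time == UINT64_MAX - start_time, so the addition below cannot overflow.
// A timeout of 0 means "no deadline" and maps to UINT64_MAX ("umax" in RPCS3 terms).
std::uint64_t compute_end_time(std::uint64_t start_time, std::uint64_t timeout)
{
    return timeout ? start_time + std::min<std::uint64_t>(timeout, ~start_time)
                   : UINT64_MAX;
}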

View File

@ -1206,24 +1206,27 @@ namespace cpu_counter
void remove(cpu_thread*) noexcept;
}
void lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
bool lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
{
// Should already be performed when using this flag
if (!g_postpone_notify_barrier)
{
prepare_for_sleep(cpu);
}
bool result = false;
const u64 current_time = get_guest_system_time();
{
std::lock_guard lock{g_mutex};
sleep_unlocked(cpu, timeout);
result = sleep_unlocked(cpu, timeout, current_time);
if (!g_to_awake.empty())
{
// Schedule pending entries
awake_unlocked({});
}
schedule_all();
schedule_all(current_time);
}
if (!g_postpone_notify_barrier)
@ -1232,6 +1235,7 @@ void lv2_obj::sleep(cpu_thread& cpu, const u64 timeout)
}
g_to_awake.clear();
return result;
}
bool lv2_obj::awake(cpu_thread* thread, s32 prio)
@ -1261,19 +1265,23 @@ bool lv2_obj::awake(cpu_thread* thread, s32 prio)
bool lv2_obj::yield(cpu_thread& thread)
{
vm::temporary_unlock(thread);
if (auto ppu = thread.try_get<ppu_thread>())
{
ppu->raddr = 0; // Clear reservation
if (!atomic_storage<ppu_thread*>::load(ppu->next_ppu))
{
// Nothing to do
return false;
}
}
return awake(&thread, yield_cmd);
}
void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
bool lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout, u64 current_time)
{
const u64 start_time = get_guest_system_time();
const u64 start_time = current_time;
auto on_to_sleep_update = [&]()
{
@ -1299,15 +1307,32 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
}
};
bool return_val = true;
if (auto ppu = thread.try_get<ppu_thread>())
{
ppu_log.trace("sleep() - waiting (%zu)", g_pending);
const auto [_ ,ok] = ppu->state.fetch_op([&](bs_t<cpu_flag>& val)
if (ppu->ack_suspend)
{
ppu->ack_suspend = false;
g_pending--;
}
if (std::exchange(ppu->cancel_sleep, 0) == 2)
{
// Signal that the underlying LV2 operation has been cancelled and replaced with a short yield
return_val = false;
}
const auto [_, ok] = ppu->state.fetch_op([&](bs_t<cpu_flag>& val)
{
if (!(val & cpu_flag::signal))
{
val += cpu_flag::suspend;
// Flag used for forced timeout notification
ensure(!timeout || !(val & cpu_flag::notify));
return true;
}
@ -1316,8 +1341,8 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
if (!ok)
{
ppu_log.trace("sleep() failed (signaled) (%s)", ppu->current_function);
return;
ppu_log.fatal("sleep() failed (signaled) (%s)", ppu->current_function);
return false;
}
// Find and remove the thread
@ -1328,20 +1353,17 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
g_to_sleep.erase(it);
ppu->start_time = start_time;
on_to_sleep_update();
return true;
}
// Already sleeping
ppu_log.trace("sleep(): called on already sleeping thread.");
return;
}
if (std::exchange(ppu->ack_suspend, false))
{
g_pending--;
return false;
}
ppu->raddr = 0; // Clear reservation
ppu->start_time = start_time;
ppu->end_time = timeout ? start_time + std::min<u64>(timeout, ~start_time) : u64{umax};
}
else if (auto spu = thread.try_get<spu_thread>())
{
@ -1349,14 +1371,15 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
{
g_to_sleep.erase(it);
on_to_sleep_update();
return true;
}
return;
return false;
}
if (timeout)
{
const u64 wait_until = start_time + timeout;
const u64 wait_until = start_time + std::min<u64>(timeout, ~start_time);
// Register timeout if necessary
for (auto it = g_waiting.cbegin(), end = g_waiting.cend();; it++)
@ -1368,6 +1391,8 @@ void lv2_obj::sleep_unlocked(cpu_thread& thread, u64 timeout)
}
}
}
return return_val;
}
bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
@ -1403,59 +1428,37 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
if (ppu == cpu)
{
auto ppu2_next = &ppu->next_ppu;
auto ppu2 = ppu->next_ppu;
if (auto next = +*ppu2_next; !next || next->prio != ppu->prio)
{
return false;
}
for (;; i++)
{
const auto next = +*ppu2_next;
if (auto next2 = +next->next_ppu; !next2 || next2->prio != ppu->prio)
{
break;
}
ppu2_next = &next->next_ppu;
}
if (ppu2_next == &ppu->next_ppu)
if (!ppu2 || ppu2->prio != ppu->prio)
{
// Empty 'same prio' threads list
return false;
}
auto ppu2 = +*ppu2_next;
for (i++;; i++)
{
const auto next = ppu2->next_ppu;
if (!next || next->prio != ppu->prio)
{
break;
}
ppu2 = next;
}
// Rotate current thread to the last position of the 'same prio' threads list
*ppu_next = ppu2;
// Exchange forward pointers
if (ppu->next_ppu != ppu2)
{
auto ppu2_val = +ppu2->next_ppu;
ppu2->next_ppu = +ppu->next_ppu;
ppu->next_ppu = ppu2_val;
*ppu2_next = ppu;
}
else
{
auto ppu2_val = +ppu2->next_ppu;
ppu2->next_ppu = ppu;
ppu->next_ppu = ppu2_val;
}
*ppu_next = std::exchange(ppu->next_ppu, std::exchange(ppu2->next_ppu, ppu));
if (i <= g_cfg.core.ppu_threads + 0u)
if (i < g_cfg.core.ppu_threads + 0u)
{
// Threads were rotated, but no context switch was made
return false;
}
ppu->start_time = get_guest_system_time();
cpu = nullptr; // Disable current thread enqueing, also enable threads list enqueing
break;
}
@ -1479,6 +1482,13 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
if (next == cpu)
{
ppu_log.trace("sleep() - suspended (p=%zu)", g_pending);
if (static_cast<ppu_thread*>(cpu)->cancel_sleep == 1)
{
// The next sleep call of the thread is cancelled
static_cast<ppu_thread*>(cpu)->cancel_sleep = 2;
}
return false;
}
@ -1510,19 +1520,10 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
// Yield changed the queue before
bool changed_queue = prio == yield_cmd;
if (cpu)
if (cpu && prio != yield_cmd)
{
// Emplace current thread
if (!emplace_thread(cpu))
{
if (g_postpone_notify_barrier)
{
// This flag includes common optimizations regarding syscalls
// one of which is to allow a lock-free version of syscalls with awake behave as semaphore post: always notifies the thread, even if it hasn't slept yet
cpu->state += cpu_flag::signal;
}
}
else
if (emplace_thread(cpu))
{
changed_queue = true;
}
@ -1530,35 +1531,16 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
else for (const auto _cpu : g_to_awake)
{
// Emplace threads from list
if (!emplace_thread(_cpu))
{
if (g_postpone_notify_barrier)
{
_cpu->state += cpu_flag::signal;
}
}
else
if (emplace_thread(_cpu))
{
changed_queue = true;
}
}
// Remove pending if necessary
if (g_pending && ((cpu && cpu == get_current_cpu_thread()) || prio == yield_cmd))
{
if (auto cur = cpu_thread::get_current<ppu_thread>())
{
if (std::exchange(cur->ack_suspend, false))
{
g_pending--;
}
}
}
auto target = +g_ppu;
// Suspend threads if necessary
for (usz i = 0, thread_count = g_cfg.core.ppu_threads; changed_queue && target; target = target->next_ppu, i++)
for (usz i = 0, thread_count = g_cfg.core.ppu_threads; target; target = target->next_ppu, i++)
{
if (i >= thread_count && cpu_flag::suspend - target->state)
{
@ -1574,6 +1556,17 @@ bool lv2_obj::awake_unlocked(cpu_thread* cpu, s32 prio)
}
}
const auto current_ppu = cpu_thread::get_current<ppu_thread>();
// Remove pending if necessary
if (current_ppu)
{
if (std::exchange(current_ppu->ack_suspend, false))
{
ensure(g_pending)--;
}
}
return changed_queue;
}
@ -1585,12 +1578,12 @@ void lv2_obj::cleanup()
g_pending = 0;
}
void lv2_obj::schedule_all()
void lv2_obj::schedule_all(u64 current_time)
{
usz notify_later_idx = 0;
if (!g_pending && g_to_sleep.empty())
{
usz notify_later_idx = 0;
auto target = +g_ppu;
// Wake up threads
@ -1602,8 +1595,9 @@ void lv2_obj::schedule_all()
target->state ^= (cpu_flag::signal + cpu_flag::suspend);
target->start_time = 0;
if (notify_later_idx >= std::size(g_to_notify) - 1)
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::signal + cpu_flag::suspend);
}
else
@ -1612,19 +1606,39 @@ void lv2_obj::schedule_all()
}
}
}
g_to_notify[notify_later_idx] = nullptr;
}
// Check registered timeouts
while (!g_waiting.empty())
{
auto& pair = g_waiting.front();
const auto pair = &g_waiting.front();
if (pair.first <= get_guest_system_time())
if (!current_time)
{
pair.second->notify();
current_time = get_guest_system_time();
}
if (pair->first <= current_time)
{
const auto target = pair->second;
g_waiting.pop_front();
if (target != cpu_thread::get_current())
{
// Change cpu_thread::state for the lightweight notification to work
ensure(!target->state.test_and_set(cpu_flag::notify));
// Otherwise notify it to wake itself
if (notify_later_idx == std::size(g_to_notify))
{
// Out of notification slots, notify locally (resizable container is not worth it)
target->state.notify_one(cpu_flag::notify);
}
else
{
g_to_notify[notify_later_idx++] = &target->state;
}
}
}
else
{
@ -1632,6 +1646,12 @@ void lv2_obj::schedule_all()
break;
}
}
if (notify_later_idx - 1 < std::size(g_to_notify) - 1)
{
// Null-terminate the list if it ends before last slot
g_to_notify[notify_later_idx] = nullptr;
}
}
ppu_thread_status lv2_obj::ppu_state(ppu_thread* ppu, bool lock_idm, bool lock_lv2)
@ -1715,3 +1735,109 @@ bool lv2_obj::has_ppus_in_running_state()
return false;
}
bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
const u64 start_time = get_system_time();
if (cpu)
{
if (u64 end_time = cpu->end_time; end_time != umax)
{
const u64 guest_start = get_guest_system_time(start_time);
if (end_time <= guest_start)
{
return true;
}
usec = end_time - guest_start;
scale = true;
}
}
if (scale)
{
// Scale time
usec = std::min<u64>(usec, u64{umax} / 100) * 100 / g_cfg.core.clocks_scale;
}
// Clamp
usec = std::min<u64>(usec, max_timeout);
u64 passed = 0;
atomic_bs_t<cpu_flag> dummy{};
const auto& state = cpu ? cpu->state : dummy;
auto old_state = +state;
auto wait_for = [&](u64 timeout)
{
thread_ctrl::wait_on(state, old_state, timeout);
};
for (;; old_state = state)
{
if (old_state & cpu_flag::notify)
{
// Timeout notification has been forced
break;
}
if (old_state & cpu_flag::signal)
{
return false;
}
if (::is_stopped(old_state) || thread_ctrl::state() == thread_state::aborting)
{
return passed >= usec;
}
if (passed >= usec)
{
break;
}
u64 remaining = usec - passed;
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = is_usleep && remaining <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
// TODO: Tune for other non-Windows operating systems
if (g_cfg.core.sleep_timers_accuracy < (is_usleep ? sleep_timers_accuracy_level::_usleep : sleep_timers_accuracy_level::_all_timers))
{
wait_for(remaining);
}
else
{
if (remaining > host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum));
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
wait_for(remaining - (remaining % host_min_quantum));
#endif
}
// TODO: Determine best value for yield delay
else
{
// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
std::this_thread::yield();
}
}
passed = get_system_time() - start_time;
}
return true;
}
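
With the templated header helper removed (see sys_sync.h further down), callers now pass the scaling options at run time, and for PPU threads the deadline comes from ppu->end_time. A hedged usage fragment mirroring the two call sites changed later in this commit (not standalone code; the comments describe how the return value is interpreted there):

// sys_timer_usleep style: scale by clocks_scale, use the tighter usleep quantum on Linux.
// A false return means the wait ended before the requested time (signal / emulator stopping),
// so the syscall is flagged to run again.
if (!lv2_obj::wait_timeout(sleep_time, &ppu, /*scale=*/true, /*is_usleep=*/true))
{
    ppu.state += cpu_flag::again;
}

// RSX flip throttling: plain host-time delay, no PPU thread attached and no guest-clock scaling.
lv2_obj::wait_timeout(delay_us, nullptr, /*scale=*/false);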

View File

@ -162,6 +162,8 @@ error_code _sys_lwmutex_lock(ppu_thread& ppu, u32 lwmutex_id, u64 timeout)
lv2_obj::prepare_for_sleep(ppu);
ppu.cancel_sleep = 1;
if (s32 signal = mutex.try_own(&ppu))
{
if (signal == smin)
@ -169,12 +171,13 @@ error_code _sys_lwmutex_lock(ppu_thread& ppu, u32 lwmutex_id, u64 timeout)
ppu.gpr[3] = CELL_EBUSY;
}
ppu.cancel_sleep = 0;
return true;
}
mutex.sleep(ppu, timeout);
const bool finished = !mutex.sleep(ppu, timeout);
notify.cleanup();
return false;
return finished;
});
if (!mutex)

View File

@ -135,7 +135,7 @@ struct lv2_lwmutex final : lv2_obj
control_data_t store = old;
store.signaled |= (unlock2 ? s32{smin} : 1);
if (lv2_control.compare_and_swap_test(old, store))
if (lv2_control.compare_exchange(old, store))
{
return true;
}
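
The switch from compare_and_swap_test to compare_exchange matters because the latter (like std::atomic's compare_exchange_strong) writes the currently observed value back into the expected argument on failure, so the enclosing retry loop keeps working with fresh data without a separate reload. A minimal illustration with std::atomic (the real lv2_control is an RPCS3 atomic_t over control_data_t):

#include <atomic>

std::atomic<int> control{0};

// Set the "signaled" bit, retrying if another thread modified the value concurrently.
void signal()
{
    int old = control.load();
    int store;
    do
    {
        store = old | 1;
        // On failure 'old' is updated to the current value, so the next iteration
        // recomputes 'store' from fresh data instead of re-reading the atomic by hand.
    }
    while (!control.compare_exchange_strong(old, store));
}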

View File

@ -162,15 +162,19 @@ error_code sys_mutex_lock(ppu_thread& ppu, u32 mutex_id, u64 timeout)
{
lv2_obj::prepare_for_sleep(ppu);
if (mutex.try_own(ppu))
ppu.cancel_sleep = 1;
if (mutex.try_own(ppu) || !mutex.sleep(ppu, timeout))
{
result = {};
}
else
if (ppu.cancel_sleep != 1)
{
mutex.sleep(ppu, timeout);
notify.cleanup();
}
ppu.cancel_sleep = 0;
}
return result;

View File

@ -225,7 +225,7 @@ public:
private:
// Remove the current thread from the scheduling queue, register timeout
static void sleep_unlocked(cpu_thread&, u64 timeout);
static bool sleep_unlocked(cpu_thread&, u64 timeout, u64 current_time);
// Schedule the thread
static bool awake_unlocked(cpu_thread*, s32 prio = enqueue_cmd);
@ -233,7 +233,7 @@ private:
public:
static constexpr u64 max_timeout = u64{umax} / 1000;
static void sleep(cpu_thread& cpu, const u64 timeout = 0);
static bool sleep(cpu_thread& cpu, const u64 timeout = 0);
static bool awake(cpu_thread* thread, s32 prio = enqueue_cmd);
@ -406,95 +406,7 @@ public:
return make;
}
template <bool IsUsleep = false, bool Scale = true>
static bool wait_timeout(u64 usec, cpu_thread* const cpu = {})
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
if constexpr (Scale)
{
// Scale time
usec = std::min<u64>(usec, u64{umax} / 100) * 100 / g_cfg.core.clocks_scale;
}
// Clamp
usec = std::min<u64>(usec, max_timeout);
u64 passed = 0;
const u64 start_time = get_system_time();
auto wait_for = [cpu](u64 timeout)
{
atomic_bs_t<cpu_flag> dummy{};
auto& state = cpu ? cpu->state : dummy;
const auto old = +state;
if (old & cpu_flag::signal)
{
return true;
}
thread_ctrl::wait_on(state, old, timeout);
return false;
};
while (usec >= passed)
{
u64 remaining = usec - passed;
#ifdef __linux__
// NOTE: Assumption that timer initialization has succeeded
u64 host_min_quantum = IsUsleep && remaining <= 1000 ? 10 : 50;
#else
// Host scheduler quantum for windows (worst case)
// NOTE: On ps3 this function has very high accuracy
constexpr u64 host_min_quantum = 500;
#endif
// TODO: Tune for other non windows operating sytems
bool escape = false;
if (g_cfg.core.sleep_timers_accuracy < (IsUsleep ? sleep_timers_accuracy_level::_usleep : sleep_timers_accuracy_level::_all_timers))
{
escape = wait_for(remaining);
}
else
{
if (remaining > host_min_quantum)
{
#ifdef __linux__
// Do not wait for the last quantum to avoid loss of accuracy
escape = wait_for(remaining - ((remaining % host_min_quantum) + host_min_quantum));
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
escape = wait_for(remaining - (remaining % host_min_quantum));
#endif
}
else
{
// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure
std::this_thread::yield();
}
}
if (auto cpu0 = get_current_cpu_thread(); cpu0 && cpu0->is_stopped())
{
return false;
}
if (thread_ctrl::state() == thread_state::aborting)
{
return false;
}
if (escape)
{
return false;
}
passed = get_system_time() - start_time;
}
return true;
}
static bool wait_timeout(u64 usec, ppu_thread* cpu = {}, bool scale = true, bool is_usleep = false);
static inline void notify_all()
{
@ -502,9 +414,7 @@ public:
{
if (!cpu)
{
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
return;
break;
}
if (cpu != &g_to_notify)
@ -514,6 +424,9 @@ public:
atomic_wait_engine::notify_one(cpu, 4, atomic_wait::default_mask<atomic_bs_t<cpu_flag>>);
}
}
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
}
// Can be called before the actual sleep call in order to move it out of mutex scope
@ -542,7 +455,8 @@ public:
}
// While IDM mutex is still locked (this function assumes so) check if the notification is still needed
if (cpu != &g_to_notify && !static_cast<const decltype(cpu_thread::state)*>(cpu)->all_of(cpu_flag::signal + cpu_flag::wait))
// Pending flag is meant for forced notification (if the CPU really has pending work it can restore the flag in theory)
if (cpu != &g_to_notify && static_cast<const decltype(cpu_thread::state)*>(cpu)->none_of(cpu_flag::signal + cpu_flag::pending))
{
// Omit it (this is a void pointer, it can hold anything)
cpu = &g_to_notify;
@ -575,5 +489,5 @@ private:
// If a notify_all_t object exists locally, postpone notifications to the destructor of it (not recursive, notifies on the first destructor for safety)
static thread_local bool g_postpone_notify_barrier;
static void schedule_all();
static void schedule_all(u64 current_time = 0);
};

View File

@ -409,7 +409,7 @@ error_code sys_timer_usleep(ppu_thread& ppu, u64 sleep_time)
{
lv2_obj::sleep(ppu, g_cfg.core.sleep_timers_accuracy < sleep_timers_accuracy_level::_usleep ? sleep_time : 0);
if (!lv2_obj::wait_timeout<true>(sleep_time))
if (!lv2_obj::wait_timeout(sleep_time, &ppu, true, true))
{
ppu.state += cpu_flag::again;
}

View File

@ -3404,7 +3404,7 @@ namespace rsx
if (target_rsx_flip_time > time + 1000)
{
const auto delay_us = target_rsx_flip_time - time;
lv2_obj::wait_timeout<false, false>(delay_us);
lv2_obj::wait_timeout(delay_us, nullptr, false);
performance_counters.idle_time += delay_us;
}
}