LV2: Improve sys_timer_usleep by using CPU usermode waiting

* Linux: set timerslack to minimum value
- Linux delays the wakeup of threads to save power, this feature isn't needed for this application

* Utils: Add detection for waitpkg and monitorx extensions
- These instructions are used for user mode wait instructions

* lv2: Use user mode wait instructions instead of yielding when appropriate
This commit is contained in:
Whatcookie 2023-08-05 04:49:30 -04:00 committed by GitHub
parent aee97e414f
commit d4cf12bc17
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 1 deletions

View File

@ -53,6 +53,17 @@
#include <optional>
#include <deque>
#include "util/tsc.hpp"
#include "util/sysinfo.hpp"
#if defined(ARCH_X64)
#ifdef _MSC_VER
#include <intrin.h>
#include <immintrin.h>
#else
#include <x86intrin.h>
#endif
#endif
extern std::string ppu_get_syscall_name(u64 code);
@ -1880,6 +1891,35 @@ void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
g_lv2_preempts_taken.release(0);
}
#if defined(_MSC_VER)
#define mwaitx_func
#define waitpkg_func
#else
#define mwaitx_func __attribute__((__target__("mwaitx")))
#define waitpkg_func __attribute__((__target__("waitpkg")))
#endif
#if defined(ARCH_X64)
// Waits for a number of TSC clock cycles in power optimized state
// Cstate is represented in bits [7:4]+1 cstate. So C0 requires bits [7:4] to be set to 0xf, C1 requires bits [7:4] to be set to 0.
mwaitx_func static void __mwaitx(u32 cycles, u32 cstate)
{
constexpr u32 timer_enable = 0x2;
// monitorx will wake if the cache line is written to. We don't want this, so place the monitor value on it's own cache line.
alignas(64) u64 monitor_var{};
_mm_monitorx(&monitor_var, 0, 0);
_mm_mwaitx(timer_enable, cstate, cycles);
}
// First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 state (higher power)
waitpkg_func static void __tpause(u32 cycles, u32 cstate)
{
const u64 tsc = utils::get_tsc() + cycles;
_tpause(cstate, tsc);
}
#endif
bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
@ -1965,6 +2005,7 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
if (remaining > host_min_quantum)
{
#ifdef __linux__
// With timerslack set low, Linux is precise for all values above
wait_for(remaining);
#else
// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
@ -1972,6 +2013,21 @@ bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep
#endif
}
// TODO: Determine best value for yield delay
#if defined(ARCH_X64)
else if (utils::has_appropriate_um_wait())
{
u32 us_in_tsc_clocks = remaining * (utils::get_tsc_freq() / 1000000);
if (utils::has_waitpkg())
{
__tpause(us_in_tsc_clocks, 0x1);
}
else
{
__mwaitx(us_in_tsc_clocks, 0xf0);
}
}
#endif
else
{
// Try yielding. May cause long wake latency but helps weaker CPUs a lot by alleviating resource pressure

View File

@ -1049,6 +1049,11 @@ int main(int argc, char** argv)
}
}
// Set timerslack value for Linux. The default value is 50,000ns. Change this to just 1 since we value precise timers.
#ifdef __linux__
prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
#endif
#ifdef _WIN32
// Create dummy permanent low resolution timer to workaround messing with system timer resolution
QTimer* dummy_timer = new QTimer(app.data());

View File

@ -298,7 +298,7 @@ bool utils::has_fma4()
bool utils::has_fast_vperm2b()
{
#if defined(ARCH_X64)
static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20) == 0x20;
static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x40) == 0x40;
return g_value;
#else
return false;
@ -325,6 +325,39 @@ bool utils::has_fsrm()
#endif
}
bool utils::has_waitx()
{
#if defined(ARCH_X64)
static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20000000) == 0x20000000;
return g_value;
#else
return false;
#endif
}
bool utils::has_waitpkg()
{
#if defined(ARCH_X64)
static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[2] & 0x20) == 0x20;
return g_value;
#else
return false;
#endif
}
// User mode waits may be unfriendly to low thread CPUs
// Filter out systems with less than 8 threads for linux and less than 12 threads for other platforms
bool utils::has_appropriate_um_wait()
{
#ifdef __linux__
static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 8) && get_tsc_freq();
return g_value;
#else
static const bool g_value = (has_waitx() || has_waitpkg()) && (get_thread_count() >= 12) && get_tsc_freq();
return g_value;
#endif
}
u32 utils::get_rep_movsb_threshold()
{
static const u32 g_value = []()

View File

@ -53,6 +53,12 @@ namespace utils
bool has_fsrm();
bool has_waitx();
bool has_waitpkg();
bool has_appropriate_um_wait();
std::string get_cpu_brand();
std::string get_system_info();