Merge remote-tracking branch 'JoelLinn/fix-cpu-clock' into canary

This commit is contained in:
illusion98 2019-11-24 09:47:59 -05:00
parent feb4f0c2a5
commit 03e926605a
8 changed files with 337 additions and 72 deletions

View File

@ -10,50 +10,135 @@
#include "xenia/base/clock.h"
#include <algorithm>
#include <climits>
#include <limits>
#include <mutex>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
DEFINE_bool(clock_no_scaling, false,
"Disable scaling code. Time management and locking is bypassed. "
"Guest system time is directly pulled from host.",
"CPU");
DEFINE_bool(clock_source_raw, false,
"Use the RDTSC instruction as the time source. "
"Host CPU must support invariant TSC. ",
"CPU");
namespace xe {
// Time scalar applied to all time operations.
double guest_time_scalar_ = 1.0;
// Tick frequency of guest.
uint64_t guest_tick_frequency_ = Clock::host_tick_frequency();
uint64_t guest_tick_frequency_ = Clock::host_tick_frequency_platform();
// Base FILETIME of the guest system from app start.
uint64_t guest_system_time_base_ = Clock::QueryHostSystemTime();
// Combined time and frequency scalar (computed by RecomputeGuestTickScalar).
double guest_tick_scalar_ = 1.0;
// Combined time and frequency ratio between host and guest.
// Split in numerator (first) and denominator (second).
// Computed by RecomputeGuestTickScalar.
std::pair<uint64_t, uint64_t> guest_tick_ratio_ = std::make_pair(1, 1);
// Native guest ticks.
thread_local uint64_t guest_tick_count_ = 0;
// 100ns ticks, relative to guest_system_time_base_.
thread_local uint64_t guest_time_filetime_ = 0;
uint64_t last_guest_tick_count_ = 0;
// Last sampled host tick count.
thread_local uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
std::mutex tick_mutex_;
void RecomputeGuestTickScalar() {
guest_tick_scalar_ = (guest_tick_frequency_ * guest_time_scalar_) /
static_cast<double>(Clock::host_tick_frequency());
// Create a rational number with numerator (first) and denominator (second)
auto frac =
std::make_pair(guest_tick_frequency_, Clock::QueryHostTickFrequency());
// Doing it this way ensures we don't mess up our frequency scaling and
// precisely controls the precision the guest_time_scalar_ can have.
if (guest_time_scalar_ > 1.0) {
frac.first *= static_cast<uint64_t>(guest_time_scalar_ * 10.0);
frac.second *= 10;
} else {
frac.first *= 10;
frac.second *= static_cast<uint64_t>(10.0 / guest_time_scalar_);
}
// Keep this a rational calculation and reduce the fraction
reduce_fraction(frac);
std::lock_guard<std::mutex> lock(tick_mutex_);
guest_tick_ratio_ = frac;
}
void UpdateGuestClock() {
// Update the guest timer for all threads.
// Return a copy of the value so locking is reduced.
uint64_t UpdateGuestClock() {
uint64_t host_tick_count = Clock::QueryHostTickCount();
uint64_t host_tick_delta = host_tick_count > last_host_tick_count_
? host_tick_count - last_host_tick_count_
: 0;
last_host_tick_count_ = host_tick_count;
uint64_t guest_tick_delta = uint64_t(host_tick_delta * guest_tick_scalar_);
guest_tick_count_ += guest_tick_delta;
guest_time_filetime_ += (guest_tick_delta * 10000000) / guest_tick_frequency_;
if (cvars::clock_no_scaling) {
// Nothing to update, calculate on the fly
return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second;
}
std::unique_lock<std::mutex> lock(tick_mutex_, std::defer_lock);
if (lock.try_lock()) {
// Translate host tick count to guest tick count.
uint64_t host_tick_delta = host_tick_count > last_host_tick_count_
? host_tick_count - last_host_tick_count_
: 0;
last_host_tick_count_ = host_tick_count;
uint64_t guest_tick_delta =
host_tick_delta * guest_tick_ratio_.first / guest_tick_ratio_.second;
last_guest_tick_count_ += guest_tick_delta;
return last_guest_tick_count_;
} else {
// Wait until another thread has finished updating the clock.
lock.lock();
return last_guest_tick_count_;
}
}
// Offset of the current guest system file time relative to the guest base time.
inline uint64_t QueryGuestSystemTimeOffset() {
if (cvars::clock_no_scaling) {
return Clock::QueryHostSystemTime() - guest_system_time_base_;
}
auto guest_tick_count = UpdateGuestClock();
uint64_t numerator = 10000000; // 100ns/10MHz resolution
uint64_t denominator = guest_tick_frequency_;
reduce_fraction(numerator, denominator);
return guest_tick_count * numerator / denominator;
}
// Returns the host ticks-per-second of whichever time source is selected by
// the clock_source_raw cvar (raw source vs. platform source).
uint64_t Clock::QueryHostTickFrequency() {
  return cvars::clock_source_raw ? host_tick_frequency_raw()
                                 : host_tick_frequency_platform();
}
// Returns the current host tick count of whichever time source is selected by
// the clock_source_raw cvar (raw source vs. platform source).
uint64_t Clock::QueryHostTickCount() {
  return cvars::clock_source_raw ? host_tick_count_raw()
                                 : host_tick_count_platform();
}
// Returns the time scalar currently applied to guest time operations.
double Clock::guest_time_scalar() {
  const double current_scalar = guest_time_scalar_;
  return current_scalar;
}
// Stores a new guest time scalar and recomputes the host/guest tick ratio.
// The request is ignored when the clock_no_scaling cvar is set.
void Clock::set_guest_time_scalar(double scalar) {
  if (!cvars::clock_no_scaling) {
    guest_time_scalar_ = scalar;
    RecomputeGuestTickScalar();
  }
}
// Returns a copy of the current guest/host tick ratio as
// (numerator, denominator). The copy is taken while holding tick_mutex_.
std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
  std::pair<uint64_t, uint64_t> snapshot;
  {
    std::lock_guard<std::mutex> guard(tick_mutex_);
    snapshot = guest_tick_ratio_;
  }
  return snapshot;
}
// Returns the guest ticks-per-second.
uint64_t Clock::guest_tick_frequency() {
  const uint64_t frequency = guest_tick_frequency_;
  return frequency;
}
void Clock::set_guest_tick_frequency(uint64_t frequency) {
@ -68,43 +153,58 @@ void Clock::set_guest_system_time_base(uint64_t time_base) {
}
uint64_t Clock::QueryGuestTickCount() {
UpdateGuestClock();
return guest_tick_count_;
auto guest_tick_count = UpdateGuestClock();
return guest_tick_count;
}
uint64_t Clock::QueryGuestSystemTime() {
UpdateGuestClock();
return guest_system_time_base_ + guest_time_filetime_;
if (cvars::clock_no_scaling) {
return Clock::QueryHostSystemTime();
}
auto guest_system_time_offset = QueryGuestSystemTimeOffset();
return guest_system_time_base_ + guest_system_time_offset;
}
uint32_t Clock::QueryGuestUptimeMillis() {
UpdateGuestClock();
uint64_t uptime_millis = guest_tick_count_ / (guest_tick_frequency_ / 1000);
uint32_t result = uint32_t(std::min(uptime_millis, uint64_t(UINT_MAX)));
return result;
}
void Clock::SetGuestTickCount(uint64_t tick_count) {
last_host_tick_count_ = Clock::QueryHostTickCount();
guest_tick_count_ = tick_count;
return static_cast<uint32_t>(
std::min<uint64_t>(QueryGuestSystemTimeOffset() / 10000,
std::numeric_limits<uint32_t>::max()));
}
void Clock::SetGuestSystemTime(uint64_t system_time) {
last_host_tick_count_ = Clock::QueryHostTickCount();
guest_time_filetime_ = system_time - guest_system_time_base_;
if (cvars::clock_no_scaling) {
// Time is fixed to host time.
return;
}
// Query the filetime offset to calculate a new base time.
auto guest_system_time_offset = QueryGuestSystemTimeOffset();
guest_system_time_base_ = system_time - guest_system_time_offset;
}
uint32_t Clock::ScaleGuestDurationMillis(uint32_t guest_ms) {
if (guest_ms == UINT_MAX) {
return UINT_MAX;
if (cvars::clock_no_scaling) {
return guest_ms;
}
constexpr uint64_t max = std::numeric_limits<uint32_t>::max();
if (guest_ms >= max) {
return max;
} else if (!guest_ms) {
return 0;
}
uint64_t scaled_ms = uint64_t(uint64_t(guest_ms) * guest_time_scalar_);
return uint32_t(std::min(scaled_ms, uint64_t(UINT_MAX)));
uint64_t scaled_ms = static_cast<uint64_t>(
(static_cast<uint64_t>(guest_ms) * guest_time_scalar_));
return static_cast<uint32_t>(std::min(scaled_ms, max));
}
int64_t Clock::ScaleGuestDurationFileTime(int64_t guest_file_time) {
if (cvars::clock_no_scaling) {
return static_cast<uint64_t>(guest_file_time);
}
if (!guest_file_time) {
return 0;
} else if (guest_file_time > 0) {
@ -116,17 +216,23 @@ int64_t Clock::ScaleGuestDurationFileTime(int64_t guest_file_time) {
return static_cast<int64_t>(guest_time) + scaled_time;
} else {
// Relative time.
uint64_t scaled_file_time =
uint64_t(uint64_t(guest_file_time) * guest_time_scalar_);
uint64_t scaled_file_time = static_cast<uint64_t>(
(static_cast<uint64_t>(guest_file_time) * guest_time_scalar_));
// TODO(benvanik): check for overflow?
return scaled_file_time;
}
}
void Clock::ScaleGuestDurationTimeval(int32_t* tv_sec, int32_t* tv_usec) {
uint64_t scaled_sec = uint64_t(uint64_t(*tv_sec) * guest_tick_scalar_);
uint64_t scaled_usec = uint64_t(uint64_t(*tv_usec) * guest_time_scalar_);
if (scaled_usec > UINT_MAX) {
if (cvars::clock_no_scaling) {
return;
}
uint64_t scaled_sec = static_cast<uint64_t>(static_cast<uint64_t>(*tv_sec) *
guest_time_scalar_);
uint64_t scaled_usec = static_cast<uint64_t>(static_cast<uint64_t>(*tv_usec) *
guest_time_scalar_);
if (scaled_usec > std::numeric_limits<uint32_t>::max()) {
uint64_t overflow_sec = scaled_usec / 1000000;
scaled_usec -= overflow_sec * 1000000;
scaled_sec += overflow_sec;

View File

@ -12,12 +12,25 @@
#include <cstdint>
#include "xenia/base/cvar.h"
DECLARE_bool(clock_no_scaling);
DECLARE_bool(clock_source_raw);
namespace xe {
class Clock {
public:
// Host ticks-per-second.
static uint64_t host_tick_frequency();
// Host ticks-per-second. Generally QueryHostTickFrequency should be used.
// Either from platform supplied time source or from hardware directly.
static uint64_t host_tick_frequency_platform();
static uint64_t host_tick_frequency_raw();
// Host tick count. Generally QueryHostTickCount() should be used.
static uint64_t host_tick_count_platform();
static uint64_t host_tick_count_raw();
// Queries the host tick frequency.
static uint64_t QueryHostTickFrequency();
// Queries the current host tick count.
static uint64_t QueryHostTickCount();
// Host time, in FILETIME format.
@ -30,6 +43,8 @@ class Clock {
// Sets the guest time scalar, adjusting tick and wall clock speed.
// Ex: 1x=normal, 2x=double speed, 1/2x=half speed.
static void set_guest_time_scalar(double scalar);
// Get the tick ratio between host and guest including time scaling if set.
static std::pair<uint64_t, uint64_t> guest_tick_ratio();
// Guest ticks-per-second.
static uint64_t guest_tick_frequency();
// Sets the guest ticks-per-second.
@ -39,6 +54,7 @@ class Clock {
// Sets the guest time base, used for computing the system time.
// By default this is the current system time.
static void set_guest_system_time_base(uint64_t time_base);
// Queries the current guest tick count, accounting for frequency adjustment
// and scaling.
static uint64_t QueryGuestTickCount();
@ -47,9 +63,7 @@ class Clock {
// Queries the milliseconds since the guest began, accounting for scaling.
static uint32_t QueryGuestUptimeMillis();
// Sets the guest tick count for the current thread.
static void SetGuestTickCount(uint64_t tick_count);
// Sets the system time for the current thread.
// Sets the system time of the guest.
static void SetGuestSystemTime(uint64_t system_time);
// Scales a time duration in milliseconds, from guest time.

View File

@ -14,14 +14,14 @@
namespace xe {
uint64_t Clock::host_tick_frequency() {
// Derives the platform tick frequency from the resolution reported by
// clock_getres(CLOCK_MONOTONIC_RAW).
// NOTE(review): the result is tv_sec + tv_nsec * 1e9, which equals 1e9 only
// when the reported resolution is exactly 1ns; for any other resolution the
// value does not look like a ticks-per-second figure - confirm against the
// unit returned by host_tick_count_platform().
uint64_t Clock::host_tick_frequency_platform() {
  timespec resolution;
  clock_getres(CLOCK_MONOTONIC_RAW, &resolution);
  const uint64_t whole_seconds = uint64_t(resolution.tv_sec);
  const uint64_t nanoseconds = uint64_t(resolution.tv_nsec);
  return whole_seconds + nanoseconds * 1000000000ull;
}
uint64_t Clock::QueryHostTickCount() {
uint64_t Clock::host_tick_count_platform() {
timespec res;
clock_gettime(CLOCK_MONOTONIC_RAW, &res);
@ -40,7 +40,7 @@ uint64_t Clock::QueryHostSystemTime() {
}
uint64_t Clock::QueryHostUptimeMillis() {
return QueryHostTickCount() / (host_tick_frequency() / 1000);
return host_tick_count_platform() * 1000 / host_tick_frequency_platform();
}
} // namespace xe

View File

@ -13,15 +13,13 @@
namespace xe {
uint64_t Clock::host_tick_frequency() {
static LARGE_INTEGER frequency = {{0}};
if (!frequency.QuadPart) {
QueryPerformanceFrequency(&frequency);
}
uint64_t Clock::host_tick_frequency_platform() {
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
return frequency.QuadPart;
}
uint64_t Clock::QueryHostTickCount() {
uint64_t Clock::host_tick_count_platform() {
LARGE_INTEGER counter;
uint64_t time = 0;
if (QueryPerformanceCounter(&counter)) {
@ -37,7 +35,7 @@ uint64_t Clock::QueryHostSystemTime() {
}
uint64_t Clock::QueryHostUptimeMillis() {
return QueryHostTickCount() / (host_tick_frequency() / 1000);
return host_tick_count_platform() * 1000 / host_tick_frequency_platform();
}
} // namespace xe

105
src/xenia/base/clock_x64.cc Normal file
View File

@ -0,0 +1,105 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2019 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/base/platform.h"
#if XE_ARCH_AMD64
#include "xenia/base/clock.h"
#include "xenia/base/logging.h"
// Wrap all these different cpu compiler intrinsics.
// So no inline assembler here and the compiler will remove the clutter.
// xe_cpu_cpuid(level, eax, ebx, ecx, edx): executes CPUID for `level` and
// stores the four result registers into the given lvalues.
// xe_cpu_rdtsc(): evaluates to the value returned by the __rdtsc() intrinsic.
//
// MSVC variant: __cpuid writes EAX, EBX, ECX, EDX into an int[4] (in that
// order); the values are then copied out as uint32_t. The macro expands to a
// braced block.
#if XE_COMPILER_MSVC
#define xe_cpu_cpuid(level, eax, ebx, ecx, edx) \
{ \
int __xe_cpuid_registers_[4]; \
__cpuid(__xe_cpuid_registers_, (level)); \
(eax) = static_cast<uint32_t>(__xe_cpuid_registers_[0]); \
(ebx) = static_cast<uint32_t>(__xe_cpuid_registers_[1]); \
(ecx) = static_cast<uint32_t>(__xe_cpuid_registers_[2]); \
(edx) = static_cast<uint32_t>(__xe_cpuid_registers_[3]); \
}
#define xe_cpu_rdtsc() __rdtsc()
// Clang/GCC variant: forwards to the __cpuid macro from <cpuid.h>. Note that
// the expansion already ends with a semicolon.
#elif XE_COMPILER_CLANG || XE_COMPILER_GNUC
#include <cpuid.h>
#define xe_cpu_cpuid(level, eax, ebx, ecx, edx) \
__cpuid((level), (eax), (ebx), (ecx), (edx));
#define xe_cpu_rdtsc() __rdtsc()
#else
// Any other compiler is rejected at compile time.
#error "No cpu instruction wrappers for current compiler implemented."
#endif
// Reports through xe::FatalError that the raw clock source cannot be used;
// `msg` is inserted at the %s placeholder as the specific reason, followed by
// a hint to set the 'clock_source_raw' cvar to 'false'.
// The expansion already ends with a semicolon, so a use site compiles with or
// without its own trailing semicolon.
#define CLOCK_FATAL(msg) \
xe::FatalError( \
"The raw clock source is not supported on your CPU. \n" \
"%s \n" \
"Set the cvar 'clock_source_raw' to 'false'.", \
(msg));
namespace xe {
// Getting the TSC frequency can be a bit tricky. This method here only works on
// Intel as it seems. There is no easy way to get the frequency outside of ring0
// on AMD, so we fail gracefully if not possible.
//
// Returns the TSC frequency in Hz as determined through CPUID. If the CPU
// does not report an invariant TSC, or no usable frequency can be derived,
// CLOCK_FATAL is raised and 0 is returned.
uint64_t Clock::host_tick_frequency_raw() {
  uint32_t eax, ebx, ecx, edx;

  // 00H Get max supported cpuid level.
  xe_cpu_cpuid(0x0, eax, ebx, ecx, edx);
  const auto max_cpuid = eax;
  // 80000000H Get max extended cpuid level
  xe_cpu_cpuid(0x80000000, eax, ebx, ecx, edx);
  const auto max_cpuid_ex = eax;

  // 80000007H Get extended power feature info
  if (max_cpuid_ex < 0x80000007) {
    CLOCK_FATAL("Unclear if the CPU has an invariant TSC.");
    return 0;
  }
  xe_cpu_cpuid(0x80000007, eax, ebx, ecx, edx);
  // Invariant TSC bit at position 8
  const auto tsc_invariant = edx & (1u << 8);
  // If the TSC is not invariant it will change its frequency with power
  // states and across cores.
  if (!tsc_invariant) {
    CLOCK_FATAL("The CPU has no invariant TSC.");
    return 0;
  }

  if (max_cpuid >= 0x15) {
    // 15H Get TSC/Crystal ratio and Crystal Hz.
    xe_cpu_cpuid(0x15, eax, ebx, ecx, edx);
    const uint64_t ratio_num = ebx;
    const uint64_t ratio_den = eax;
    const uint64_t cryst_freq = ecx;
    // For some CPUs, Crystal frequency is not reported.
    if (ratio_num && ratio_den && cryst_freq) {
      // If it is, the TSC frequency follows directly from the ratio. This
      // value was previously computed but never returned, so the function
      // always fell through to the less direct leaf 16H path.
      return cryst_freq * ratio_num / ratio_den;
    }
  }

  if (max_cpuid >= 0x16) {
    // 16H Get CPU base frequency MHz in EAX.
    xe_cpu_cpuid(0x16, eax, ebx, ecx, edx);
    const uint64_t cpu_base_freq = static_cast<uint64_t>(eax) * 1000000;
    // A reported base frequency of zero is unusable; fall through to the
    // fatal error below instead of handing a zero frequency to the caller.
    if (cpu_base_freq) {
      return cpu_base_freq;
    }
  }

  CLOCK_FATAL("The clock frequency could not be determined.");
  return 0;
}
// Returns the current tick count of the raw time source, read through the
// xe_cpu_rdtsc() wrapper.
uint64_t Clock::host_tick_count_raw() {
  const uint64_t tsc_value = xe_cpu_rdtsc();
  return tsc_value;
}
} // namespace xe
#endif

View File

@ -15,6 +15,7 @@
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <type_traits>
#include "xenia/base/platform.h"
@ -59,6 +60,34 @@ T next_pow2(T value) {
return value;
}
// Returns the greatest common divisor of a and b.
// Uses std::gcd when the standard library advertises it and otherwise falls
// back to a hand-written Euclid loop. With the fallback,
// greatest_common_divisor(x, 0) == x and greatest_common_divisor(0, 0) == 0.
#if defined(__cpp_lib_gcd_lcm)
template <typename T>
inline constexpr T greatest_common_divisor(T a, T b) {
  return std::gcd(a, b);
}
#else
template <typename T>
constexpr T greatest_common_divisor(T a, T b) {
  // Use the Euclid algorithm to calculate the greatest common divisor.
  // A plain temporary is used instead of std::exchange: std::exchange is not
  // constexpr before C++20 and would additionally require <utility>.
  while (b) {
    const T remainder = a % b;
    a = b;
    b = remainder;
  }
  return a;
}
#endif
// Reduces the fraction numerator/denominator to lowest terms in place by
// dividing both values by their greatest common divisor.
// If both values are zero their common divisor is zero as well; the fraction
// is then left untouched instead of dividing by zero.
template <typename T>
inline constexpr void reduce_fraction(T& numerator, T& denominator) {
  const auto divisor = greatest_common_divisor(numerator, denominator);
  if (divisor == 0) {
    return;
  }
  numerator /= divisor;
  denominator /= divisor;
}
// Convenience overload: reduces a fraction stored as
// (numerator = first, denominator = second) in place.
template <typename T>
inline constexpr void reduce_fraction(std::pair<T, T>& fraction) {
  T& numerator = fraction.first;
  T& denominator = fraction.second;
  reduce_fraction<T>(numerator, denominator);
}
// Builds a 32-bit mask with bits a..b (inclusive, bit 0 = least significant)
// set and all other bits cleared.
constexpr uint32_t make_bitmask(uint32_t a, uint32_t b) {
  // All bits from 0 up to and including b.
  const uint32_t bits_up_to_b = static_cast<uint32_t>(-1) >> (31 - b);
  // All bits strictly below a.
  const uint32_t bits_below_a = (1u << a) - 1;
  return bits_up_to_b & ~bits_below_a;
}

View File

@ -440,9 +440,34 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128);
// ============================================================================
struct LOAD_CLOCK : Sequence<LOAD_CLOCK, I<OPCODE_LOAD_CLOCK, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// It'd be cool to call QueryPerformanceCounter directly, but w/e.
e.CallNative(LoadClock);
e.mov(i.dest, e.rax);
// When scaling is disabled and the raw clock source is selected, the code
// in the Clock class is actually just forwarding tick counts after one
// simple multiply and division. In that case we rather bake the scaling in
// here to cut extra function calls with CPU cache misses and stack frame
// overhead.
if (cvars::clock_no_scaling && cvars::clock_source_raw) {
auto ratio = Clock::guest_tick_ratio();
// The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
// mfence/lfence magic the rdtsc instruction can be executed sooner or
// later in the cache window. Since its resolution however is much higher
// than the 360's mftb instruction this can safely be ignored.
// Read time stamp in edx (high part) and eax (low part).
e.rdtsc();
// Make it a 64 bit number in rax.
e.shl(e.rdx, 32);
e.or_(e.rax, e.rdx);
// Apply tick frequency scaling.
e.mov(e.rcx, ratio.first);
e.mul(e.rcx);
// We actually now have a 128 bit number in rdx:rax.
e.mov(e.rcx, ratio.second);
e.div(e.rcx);
e.mov(i.dest, e.rax);
} else {
e.CallNative(LoadClock);
e.mov(i.dest, e.rax);
}
}
static uint64_t LoadClock(void* raw_context) {
return Clock::QueryGuestTickCount();

View File

@ -823,10 +823,6 @@ struct ThreadSavedState {
bool is_main_thread; // Is this the main thread?
bool is_running;
// Clock settings (invalid if not running)
uint64_t tick_count_;
uint64_t system_time_;
uint32_t apc_head;
uint32_t tls_static_address;
uint32_t tls_dynamic_address;
@ -895,10 +891,6 @@ bool XThread::Save(ByteStream* stream) {
state.stack_alloc_size = stack_alloc_size_;
if (running_) {
state.tick_count_ = Clock::QueryGuestTickCount();
state.system_time_ =
Clock::QueryGuestSystemTime() - Clock::guest_system_time_base();
// Context information
auto context = thread_state_->context();
state.context.lr = context->lr;
@ -1008,10 +1000,6 @@ object_ref<XThread> XThread::Restore(KernelState* kernel_state,
// Profiler needs to know about the thread.
xe::Profiler::ThreadEnter(thread->name().c_str());
// Setup the time now that we're in the thread.
Clock::SetGuestTickCount(state.tick_count_);
Clock::SetGuestSystemTime(state.system_time_);
current_xthread_tls_ = thread;
current_thread_ = thread;