[CPU] Implement mftb instruction natively.

When the cvars clock_no_scaling and clock_source_raw are set, tick counts will be directly calculated in the emitted code.
2019-11-22 03:42:15 +01:00 · 2019-11-22 03:42:15 +01:00 · 922f1f220a
parent 15d422d988
commit 922f1f220a
3 changed files with 36 additions and 4 deletions
--- a/src/xenia/base/clock.cc
+++ b/src/xenia/base/clock.cc
@ -134,6 +134,11 @@ void Clock::set_guest_time_scalar(double scalar) {
  RecomputeGuestTickScalar();
 }

+std::pair<uint64_t, uint64_t> Clock::guest_tick_ratio() {
+  std::lock_guard<std::mutex> lock(tick_mutex_);
+  return guest_tick_ratio_;
+}
+
 uint64_t Clock::guest_tick_frequency() { return guest_tick_frequency_; }

 void Clock::set_guest_tick_frequency(uint64_t frequency) {
--- a/src/xenia/base/clock.h
+++ b/src/xenia/base/clock.h
@ -43,6 +43,8 @@ class Clock {
  // Sets the guest time scalar, adjusting tick and wall clock speed.
  // Ex: 1x=normal, 2x=double speed, 1/2x=half speed.
  static void set_guest_time_scalar(double scalar);
+  // Get the tick ratio between host and guest including time scaling if set.
+  static std::pair<uint64_t, uint64_t> guest_tick_ratio();
  // Guest ticks-per-second.
  static uint64_t guest_tick_frequency();
  // Sets the guest ticks-per-second.
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Copyright 2019 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -440,9 +440,34 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128);
 // ============================================================================
 struct LOAD_CLOCK : Sequence<LOAD_CLOCK, I<OPCODE_LOAD_CLOCK, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    // It'd be cool to call QueryPerformanceCounter directly, but w/e.
-    e.CallNative(LoadClock);
-    e.mov(i.dest, e.rax);
+    // When scaling is disabled and the raw clock source is selected, the code
+    // in the Clock class is actually just forwarding tick counts after one
+    // simple multiply and division. In that case we rather bake the scaling in
+    // here to cut extra function calls with CPU cache misses and stack frame
+    // overhead.
+    if (cvars::clock_no_scaling && cvars::clock_source_raw) {
+      auto ratio = Clock::guest_tick_ratio();
+      // The 360 CPU is an in-order CPU, AMD64 usually isn't. Without
+      // mfence/lfence magic the rdtsc instruction can be executed sooner or
+      // later in the cache window. Since its resolution however is much higher
+      // than the 360's mftb instruction this can safely be ignored.
+
+      // Read time stamp in edx (high part) and eax (low part).
+      e.rdtsc();
+      // Make it a 64 bit number in rax.
+      e.shl(e.rdx, 32);
+      e.or_(e.rax, e.rdx);
+      // Apply tick frequency scaling.
+      e.mov(e.rcx, ratio.first);
+      e.mul(e.rcx);
+      // We actually now have a 128 bit number in rdx:rax.
+      e.mov(e.rcx, ratio.second);
+      e.div(e.rcx);
+      e.mov(i.dest, e.rax);
+    } else {
+      e.CallNative(LoadClock);
+      e.mov(i.dest, e.rax);
+    }
  }
  static uint64_t LoadClock(void* raw_context) {
    return Clock::QueryGuestTickCount();