use the host clock to smooth out frame timings when audio latency is high

2017-06-26 00:46:39 -04:00 · 2017-06-26 00:46:39 -04:00 · f2ceb7f637
parent 089c3016cc
commit f2ceb7f637
9 changed files with 74 additions and 24 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,7 +103,7 @@ list(APPEND RELIB_LIBS inih)
 if(ARCH_A64)
  file(GLOB VIXL_SOURCES deps/vixl/src/*.cc deps/vixl/src/aarch64/*.cc)
  add_library(vixl STATIC ${VIXL_SOURCES})
-  target_compile_definitions(vixl PRIVATE -DVIXL_CODE_BUFFER_MALLOC)
+  target_compile_definitions(vixl PRIVATE VIXL_CODE_BUFFER_STATIC)
  list(APPEND RELIB_INCLUDES deps/vixl/src)
  list(APPEND RELIB_LIBS vixl)
 endif()
--- a/src/core/core.h
+++ b/src/core/core.h
@ -7,6 +7,13 @@

 #define array_size(arr) (int)(sizeof(arr) / sizeof((arr)[0]))

+#if COMPILER_MSVC
+#define ALIGNED(x) __declspec(align(x))
+#else
+#define ALIGNED(x) __attribute__((aligned(x)))
+#endif
+
+/* macro for accessing the parent struct of a given pointer */
 #if PLATFORM_WINDOWS

 static inline void *container_of_(void *ptr, ptrdiff_t offset) {
--- a/src/guest/arm7/arm7.c
+++ b/src/guest/arm7/arm7.c
@ -173,15 +173,11 @@ static int arm7_init(struct device *dev) {
  struct arm7 *arm = (struct arm7 *)dev;
  struct dreamcast *dc = arm->dc;

-  /* place code buffer in data segment (as opposed to allocating on the heap) to
-     keep it within 2 GB of the code segment, enabling the x64 backend to use
-     RIP-relative offsets when calling functions */
-  static uint8_t arm7_code[0x800000];
-
  /* initialize jit and its interfaces */
  arm->frontend = armv3_frontend_create();

 #if ARCH_X64
+  DEFINE_JIT_CODE_BUFFER(arm7_code);
  arm->backend = x64_backend_create(arm7_code, sizeof(arm7_code));
 #else
  arm->backend = interp_backend_create();
--- a/src/guest/sh4/sh4.c
+++ b/src/guest/sh4/sh4.c
@ -138,14 +138,11 @@ static int sh4_init(struct device *dev) {
  struct sh4 *sh4 = (struct sh4 *)dev;
  struct dreamcast *dc = sh4->dc;

-  /* place code buffer in data segment (as opposed to allocating on the heap) to
-     keep it within 2 GB of the code segment, enabling the x64 backend to use
-     RIP-relative offsets when calling functions */
-  static uint8_t sh4_code[0x800000];
-
+  /* initialize jit and its interfaces */
  sh4->frontend = sh4_frontend_create();

 #if ARCH_X64
+  DEFINE_JIT_CODE_BUFFER(sh4_code);
  sh4->backend = x64_backend_create(sh4_code, sizeof(sh4_code));
 #else
  sh4->backend = interp_backend_create();
--- a/src/host/sdl_host.c
+++ b/src/host/sdl_host.c
@ -5,8 +5,8 @@
 #include "core/filesystem.h"
 #include "core/log.h"
 #include "core/option.h"
-#include "core/profiler.h"
 #include "core/ringbuf.h"
+#include "core/time.h"
 #include "emulator.h"
 #include "host/host.h"
 #include "render/render_backend.h"
@ -23,6 +23,7 @@ DEFINE_OPTION_INT(latency, 50, "Preferred audio latency in ms");
 #define AUDIO_FRAMES_TO_MS(frames) \
  (int)(((float)frames * 1000.0f) / (float)AUDIO_FREQ)
 #define MS_TO_AUDIO_FRAMES(ms) (int)(((float)(ms) / 1000.0f) * AUDIO_FREQ)
+#define NS_TO_AUDIO_FRAMES(ns) (int)(((float)(ns) / NS_PER_SEC) * AUDIO_FREQ)

 /*
 * sdl host implementation
@ -38,6 +39,7 @@ struct sdl_host {
  SDL_AudioDeviceID audio_dev;
  SDL_AudioSpec audio_spec;
  struct ringbuf *audio_frames;
+  volatile int64_t audio_last_callback;

  int key_map[K_NUM_KEYS];
  SDL_GameController *controllers[INPUT_MAX_CONTROLLERS];
@ -86,8 +88,30 @@ static int audio_buffer_low(struct sdl_host *host) {
    return 1;
  }

+  /* SDL's write callback is called very coarsely, seemingly only once
+     its buffered data has completely drained
+
+     since the main loop is designed to synchronize speed based on the amount
+     of buffered audio data, with larger buffer sizes (due to a larger latency
+     setting) this can result in the callback being called only one time for
+     multiple video frames
+
+     this creates a situation where multiple video frames are immediately run
+     when the callback fires in order to push enough audio data to avoid an
+     underflow, and then multiple vblanks occur on the host where no new frame
+     is presented as the main loop again blocks waiting for another write
+     callback to decrease the amount of buffered audio data
+
+     in order to smooth out the video frame timings when the audio latency is
+     high, the host clock is used to interpolate the amount of available audio
+     data between callbacks */
+  int64_t now = time_nanoseconds();
+  int64_t since_last_callback = now - host->audio_last_callback;
+  int frames_available = audio_available_frames(host);
+  frames_available -= NS_TO_AUDIO_FRAMES(since_last_callback);
+
  int low_water_mark = host->audio_spec.samples;
-  return audio_available_frames(host) <= low_water_mark;
+  return frames_available <= low_water_mark;
 }

 static void audio_write_callback(void *userdata, Uint8 *stream, int len) {
@ -109,6 +133,8 @@ static void audio_write_callback(void *userdata, Uint8 *stream, int len) {
    /* copy frames to output stream */
    memcpy(buf, tmp, n * frame_size);
  }
+
+  host->audio_last_callback = time_nanoseconds();
 }

 void audio_push(struct host *base, const int16_t *data, int num_frames) {
@ -152,8 +178,12 @@ static int audio_init(struct sdl_host *host) {
    return 0;
  }

-  /* create ringbuffer to store data coming in from AICA */
-  host->audio_frames = ringbuf_create(host->audio_spec.samples * 4);
+  /* create ringbuffer to store data coming in from AICA. note, the buffer needs
+     to be at least two video frames in size, in order to handle the coarse
+     synchronization used by the main loop, where an entire guest video frame is
+     run when the available audio data is deemed low */
+  static const int frame_size = 2 * 2;
+  host->audio_frames = ringbuf_create(AUDIO_FREQ * frame_size);

  /* resume device */
  SDL_PauseAudioDevice(host->audio_dev, 0);
--- a/src/jit/backend/jit_backend.h
+++ b/src/jit/backend/jit_backend.h
@ -14,6 +14,23 @@ struct jit_register {
  const void *data;
 };

+/* macro to help declare a code buffer for the backends to use
+
+   note, the code buffer needs to be placed in the data segment (as opposed to
+   allocating on the heap) to keep it within 2 GB of the code segment, enabling
+   the x64 backend to use RIP-relative offsets when calling functions
+
+   further, the code buffer needs to be no greater than 1 MB in size so the a64
+   backend can use conditional branches to thunks without trampolining
+
+   finally, the code buffer needs to be aligned to a 4 KB page so it's easy to
+   mprotect */
+#if ARCH_A64
+#define DEFINE_JIT_CODE_BUFFER(name) static uint8_t name[0x100000] ALIGNED(4096)
+#else
+#define DEFINE_JIT_CODE_BUFFER(name) static uint8_t name[0x800000] ALIGNED(4096)
+#endif
+
 struct jit_backend {
  struct jit *jit;

--- a/src/jit/backend/x64/x64_backend.cc
+++ b/src/jit/backend/x64/x64_backend.cc
@ -11,9 +11,6 @@ extern "C" {
 #include "jit/jit.h"
 }

-/* size of codegen buffer reserved for thunks */
-#define X64_THUNK_SIZE 1024
-
 /*
 * x64 register layout
 */
@ -353,8 +350,8 @@ static void x64_backend_emit_prologue(struct x64_backend *backend,
  e.add(e.dword[guestctx + guest->offset_instrs], block->num_instrs);
 }

-static void *x64_backend_emit(struct x64_backend *backend,
-                              struct jit_block *block, struct ir *ir) {
+static void x64_backend_emit(struct x64_backend *backend,
+                             struct jit_block *block, struct ir *ir) {
  auto &e = *backend->codegen;
  const uint8_t *code = backend->codegen->getCurr();

@ -396,9 +393,8 @@ static void *x64_backend_emit(struct x64_backend *backend,

  e.outLocalLabel();

+  block->host_addr = (void *)code;
  block->host_size = (int)(backend->codegen->getCurr() - code);
-
-  return (void *)code;
 }

 static void x64_backend_emit_thunks(struct x64_backend *backend) {
@ -580,7 +576,7 @@ static int x64_backend_assemble_code(struct jit_backend *base,
  /* try to generate the x64 code. if the code buffer overflows let the backend
     know so it can reset the cache and try again */
  try {
-    block->host_addr = x64_backend_emit(backend, block, ir);
+    x64_backend_emit(backend, block, ir);
  } catch (const Xbyak::Error &e) {
    if (e != Xbyak::ERR_CODE_IS_TOO_BIG) {
      LOG_FATAL("x64 codegen failure, %s", e.what());
@ -630,7 +626,8 @@ struct jit_backend *x64_backend_create(void *code, int code_size) {
      calloc(1, sizeof(struct x64_backend)));
  Xbyak::util::Cpu cpu;

-  CHECK(Xbyak::CodeArray::protect(code, code_size, true));
+  int r = protect_pages(code, code_size, ACC_READWRITEEXEC);
+  CHECK(r);

  backend->base.init = &x64_backend_init;
  backend->base.destroy = &x64_backend_destroy;
--- a/src/jit/backend/x64/x64_local.h
+++ b/src/jit/backend/x64/x64_local.h
@ -50,6 +50,8 @@ struct x64_backend {
 /*
 * backend functionality used by emitters
 */
+#define X64_THUNK_SIZE 1024
+
 #if PLATFORM_WINDOWS
 #define X64_STACK_SHADOW_SPACE 32
 #else
--- a/src/jit/jit.c
+++ b/src/jit/jit.c
@ -327,6 +327,10 @@ void jit_compile_block(struct jit *jit, uint32_t guest_addr) {
  int res = jit->backend->assemble_code(jit->backend, block, &ir);

  if (res) {
+#if 0
+    jit->backend->dump_code(jit->backend, block);
+#endif
+
    jit_finalize_block(jit, block);
  } else {
    /* if the backend overflowed, completely free the cache and let dispatch