diff --git a/CMakeLists.txt b/CMakeLists.txt index d92fdd17..da9dc658 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ list(APPEND RELIB_LIBS inih) if(ARCH_A64) file(GLOB VIXL_SOURCES deps/vixl/src/*.cc deps/vixl/src/aarch64/*.cc) add_library(vixl STATIC ${VIXL_SOURCES}) - target_compile_definitions(vixl PRIVATE -DVIXL_CODE_BUFFER_MALLOC) + target_compile_definitions(vixl PRIVATE VIXL_CODE_BUFFER_STATIC) list(APPEND RELIB_INCLUDES deps/vixl/src) list(APPEND RELIB_LIBS vixl) endif() diff --git a/src/core/core.h b/src/core/core.h index 8462072f..b1f7b2dc 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -7,6 +7,13 @@ #define array_size(arr) (int)(sizeof(arr) / sizeof((arr)[0])) +#if COMPILER_MSVC +#define ALIGNED(x) __declspec(align(x)) +#else +#define ALIGNED(x) __attribute__((aligned(x))) +#endif + +/* macro for accessing the parent struct of a given pointer */ #if PLATFORM_WINDOWS static inline void *container_of_(void *ptr, ptrdiff_t offset) { diff --git a/src/guest/arm7/arm7.c b/src/guest/arm7/arm7.c index 810f4cf6..3e202ede 100644 --- a/src/guest/arm7/arm7.c +++ b/src/guest/arm7/arm7.c @@ -173,15 +173,11 @@ static int arm7_init(struct device *dev) { struct arm7 *arm = (struct arm7 *)dev; struct dreamcast *dc = arm->dc; - /* place code buffer in data segment (as opposed to allocating on the heap) to - keep it within 2 GB of the code segment, enabling the x64 backend to use - RIP-relative offsets when calling functions */ - static uint8_t arm7_code[0x800000]; - /* initialize jit and its interfaces */ arm->frontend = armv3_frontend_create(); #if ARCH_X64 + DEFINE_JIT_CODE_BUFFER(arm7_code); arm->backend = x64_backend_create(arm7_code, sizeof(arm7_code)); #else arm->backend = interp_backend_create(); diff --git a/src/guest/sh4/sh4.c b/src/guest/sh4/sh4.c index 240c794a..216cb3a2 100644 --- a/src/guest/sh4/sh4.c +++ b/src/guest/sh4/sh4.c @@ -138,14 +138,11 @@ static int sh4_init(struct device *dev) { struct sh4 *sh4 = (struct sh4 *)dev; 
struct dreamcast *dc = sh4->dc; - /* place code buffer in data segment (as opposed to allocating on the heap) to - keep it within 2 GB of the code segment, enabling the x64 backend to use - RIP-relative offsets when calling functions */ - static uint8_t sh4_code[0x800000]; - + /* initialize jit and its interfaces */ sh4->frontend = sh4_frontend_create(); #if ARCH_X64 + DEFINE_JIT_CODE_BUFFER(sh4_code); sh4->backend = x64_backend_create(sh4_code, sizeof(sh4_code)); #else sh4->backend = interp_backend_create(); diff --git a/src/host/sdl_host.c b/src/host/sdl_host.c index ece7370a..ed809d78 100644 --- a/src/host/sdl_host.c +++ b/src/host/sdl_host.c @@ -5,8 +5,8 @@ #include "core/filesystem.h" #include "core/log.h" #include "core/option.h" -#include "core/profiler.h" #include "core/ringbuf.h" +#include "core/time.h" #include "emulator.h" #include "host/host.h" #include "render/render_backend.h" @@ -23,6 +23,7 @@ DEFINE_OPTION_INT(latency, 50, "Preferred audio latency in ms"); #define AUDIO_FRAMES_TO_MS(frames) \ (int)(((float)frames * 1000.0f) / (float)AUDIO_FREQ) #define MS_TO_AUDIO_FRAMES(ms) (int)(((float)(ms) / 1000.0f) * AUDIO_FREQ) +#define NS_TO_AUDIO_FRAMES(ns) (int)(((float)(ns) / NS_PER_SEC) * AUDIO_FREQ) /* * sdl host implementation @@ -38,6 +39,7 @@ struct sdl_host { SDL_AudioDeviceID audio_dev; SDL_AudioSpec audio_spec; struct ringbuf *audio_frames; + volatile int64_t audio_last_callback; int key_map[K_NUM_KEYS]; SDL_GameController *controllers[INPUT_MAX_CONTROLLERS]; @@ -86,8 +88,30 @@ static int audio_buffer_low(struct sdl_host *host) { return 1; } + /* SDL's write callback is called very coarsely, seemingly, only each time + its buffered data has completely drained + + since the main loop is designed to synchronize speed based on the amount + of buffered audio data, with larger buffer sizes (due to a larger latency + setting) this can result in the callback being called only one time for + multiple video frames + + this creates a situation where 
multiple video frames are immediately run + when the callback fires in order to push enough audio data to avoid an + underflow, and then multiple vblanks occur on the host where no new frame + is presented as the main loop again blocks waiting for another write + callback to decrease the amount of buffered audio data + + in order to smooth out the video frame timings when the audio latency is + high, the host clock is used to interpolate the amount of available audio + data between callbacks */ + int64_t now = time_nanoseconds(); + int64_t since_last_callback = now - host->audio_last_callback; + int frames_available = audio_available_frames(host); + frames_available -= NS_TO_AUDIO_FRAMES(since_last_callback); + int low_water_mark = host->audio_spec.samples; - return audio_available_frames(host) <= low_water_mark; + return frames_available <= low_water_mark; } static void audio_write_callback(void *userdata, Uint8 *stream, int len) { @@ -109,6 +133,8 @@ static void audio_write_callback(void *userdata, Uint8 *stream, int len) { /* copy frames to output stream */ memcpy(buf, tmp, n * frame_size); } + + host->audio_last_callback = time_nanoseconds(); } void audio_push(struct host *base, const int16_t *data, int num_frames) { @@ -152,8 +178,12 @@ static int audio_init(struct sdl_host *host) { return 0; } - /* create ringbuffer to store data coming in from AICA */ - host->audio_frames = ringbuf_create(host->audio_spec.samples * 4); + /* create ringbuffer to store data coming in from AICA. 
note, the buffer needs + to be at least two video frames in size, in order to handle the coarse + synchronization used by the main loop, where an entire guest video frame is + run when the available audio data is deemed low */ + static const int frame_size = 2 * 2; + host->audio_frames = ringbuf_create(AUDIO_FREQ * frame_size); /* resume device */ SDL_PauseAudioDevice(host->audio_dev, 0); diff --git a/src/jit/backend/jit_backend.h b/src/jit/backend/jit_backend.h index c4bad29e..70950211 100644 --- a/src/jit/backend/jit_backend.h +++ b/src/jit/backend/jit_backend.h @@ -14,6 +14,23 @@ struct jit_register { const void *data; }; +/* macro to help declare a code buffer for the backends to use + + note, the code buffer needs to be placed in the data segment (as opposed to + allocating on the heap) to keep it within 2 GB of the code segment, enabling + the x64 backend to use RIP-relative offsets when calling functions + + further, the code buffer needs to be no greater than 1 MB in size so the a64 + backend can use conditional branches to thunks without trampolining + + finally, the code buffer needs to be aligned to a 4kb page so it's easy to + mprotect (ALIGNED precedes the type so the MSVC __declspec form parses) */ +#if ARCH_A64 +#define DEFINE_JIT_CODE_BUFFER(name) static ALIGNED(4096) uint8_t name[0x100000] +#else +#define DEFINE_JIT_CODE_BUFFER(name) static ALIGNED(4096) uint8_t name[0x800000] +#endif + struct jit_backend { struct jit *jit; diff --git a/src/jit/backend/x64/x64_backend.cc b/src/jit/backend/x64/x64_backend.cc index 13794162..976be795 100644 --- a/src/jit/backend/x64/x64_backend.cc +++ b/src/jit/backend/x64/x64_backend.cc @@ -11,9 +11,6 @@ extern "C" { #include "jit/jit.h" } -/* size of codegen buffer reserved for thunks */ -#define X64_THUNK_SIZE 1024 - /* * x64 register layout */ @@ -353,8 +350,8 @@ static void x64_backend_emit_prologue(struct x64_backend *backend, e.add(e.dword[guestctx + guest->offset_instrs], block->num_instrs); } -static void *x64_backend_emit(struct x64_backend *backend, - struct jit_block 
*block, struct ir *ir) { +static void x64_backend_emit(struct x64_backend *backend, + struct jit_block *block, struct ir *ir) { auto &e = *backend->codegen; const uint8_t *code = backend->codegen->getCurr(); @@ -396,9 +393,8 @@ static void *x64_backend_emit(struct x64_backend *backend, e.outLocalLabel(); + block->host_addr = (void *)code; block->host_size = (int)(backend->codegen->getCurr() - code); - - return (void *)code; } static void x64_backend_emit_thunks(struct x64_backend *backend) { @@ -580,7 +576,7 @@ static int x64_backend_assemble_code(struct jit_backend *base, /* try to generate the x64 code. if the code buffer overflows let the backend know so it can reset the cache and try again */ try { - block->host_addr = x64_backend_emit(backend, block, ir); + x64_backend_emit(backend, block, ir); } catch (const Xbyak::Error &e) { if (e != Xbyak::ERR_CODE_IS_TOO_BIG) { LOG_FATAL("x64 codegen failure, %s", e.what()); @@ -630,7 +626,8 @@ struct jit_backend *x64_backend_create(void *code, int code_size) { calloc(1, sizeof(struct x64_backend))); Xbyak::util::Cpu cpu; - CHECK(Xbyak::CodeArray::protect(code, code_size, true)); + int r = protect_pages(code, code_size, ACC_READWRITEEXEC); + CHECK(r); backend->base.init = &x64_backend_init; backend->base.destroy = &x64_backend_destroy; diff --git a/src/jit/backend/x64/x64_local.h b/src/jit/backend/x64/x64_local.h index 80e48437..f744d017 100644 --- a/src/jit/backend/x64/x64_local.h +++ b/src/jit/backend/x64/x64_local.h @@ -50,6 +50,8 @@ struct x64_backend { /* * backend functionality used by emitters */ +#define X64_THUNK_SIZE 1024 + #if PLATFORM_WINDOWS #define X64_STACK_SHADOW_SPACE 32 #else diff --git a/src/jit/jit.c b/src/jit/jit.c index 96619039..2a85ebf7 100644 --- a/src/jit/jit.c +++ b/src/jit/jit.c @@ -327,6 +327,10 @@ void jit_compile_block(struct jit *jit, uint32_t guest_addr) { int res = jit->backend->assemble_code(jit->backend, block, &ir); if (res) { +#if 0 + jit->backend->dump_code(jit->backend, block); 
+#endif + jit_finalize_block(jit, block); } else { /* if the backend overflowed, completely free the cache and let dispatch