From dd53be15ae7730084a645eb65d92bdfbd26e2da7 Mon Sep 17 00:00:00 2001 From: misson20000 Date: Tue, 6 Mar 2018 19:49:43 -0800 Subject: [PATCH 1/4] let scalers output in ABGR8888 --- libretro-common/gfx/scaler/scaler.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libretro-common/gfx/scaler/scaler.c b/libretro-common/gfx/scaler/scaler.c index eb0a0e429f..24b2d0c919 100644 --- a/libretro-common/gfx/scaler/scaler.c +++ b/libretro-common/gfx/scaler/scaler.c @@ -239,6 +239,10 @@ bool scaler_ctx_gen_filter(struct scaler_ctx *ctx) ctx->out_pixconv = conv_argb8888_bgr24; break; + case SCALER_FMT_ABGR8888: + ctx->out_pixconv = conv_argb8888_abgr8888; + break; + default: return false; } From 4a4de745186e5cdfd2e604e6682e4df95aae0c52 Mon Sep 17 00:00:00 2001 From: misson20000 Date: Tue, 6 Mar 2018 19:50:14 -0800 Subject: [PATCH 2/4] NSW(gfx): clear image row-by row for better caching (it's seriously a LOT faster), do pixel format conversion in scalers, and remove timing debug messages --- gfx/drivers/switch_gfx.c | 56 +++++++--------------------------------- 1 file changed, 10 insertions(+), 46 deletions(-) diff --git a/gfx/drivers/switch_gfx.c b/gfx/drivers/switch_gfx.c index ac3682dfa4..fb0b4f5299 100644 --- a/gfx/drivers/switch_gfx.c +++ b/gfx/drivers/switch_gfx.c @@ -131,8 +131,6 @@ static bool switch_frame(void *data, const void *frame, unsigned x, y; result_t r; - uint64_t begin, done_copying, post_vsync, pre_swizzle, post_swizzle, - copy_ms, swizzle_ms, vsync_ms; int tgtw, tgth, centerx, centery; uint32_t *out_buffer = NULL; switch_video_t *sw = data; @@ -148,12 +146,10 @@ static bool switch_frame(void *data, const void *frame, centerx = (1280-tgtw)/2; centery = (720-tgth)/2; - begin = svcGetSystemTick(); - // clear image to black - for(x = 0; x < 1280; x++) + for(y = 0; y < 720; y++) { - for(y = 0; y < 720; y++) + for(x = 0; x < 1280; x++) { sw->image[y*1280+x] = 0xFF000000; } @@ -173,7 +169,7 @@ static bool switch_frame(void *data, const void *frame, sw->scaler.out_width = tgtw; sw->scaler.out_height = tgth; sw->scaler.out_stride = 1280 * sizeof(uint32_t); - sw->scaler.out_fmt = SCALER_FMT_ARGB8888; + sw->scaler.out_fmt = SCALER_FMT_ABGR8888; sw->scaler.scaler_type = SCALER_TYPE_POINT; @@ -185,7 +181,7 @@ static bool switch_frame(void *data, const void *frame, sw->last_width = width; sw->last_height = height; } - + scaler_ctx_scale(&sw->scaler, sw->image + (centery * 1280) + centerx, frame); } @@ -213,23 +209,6 @@ static bool switch_frame(void *data, const void *frame, } #endif - for(x = 0; x < 1280; x++) - { - for(y = 0; y < 720; y++) - { - // swizzle components - uint32_t *pixel = &sw->image[(y*1280) + x]; - uint32_t src = *pixel; - uint8_t a = (src & 0xFF000000) >> 24; - uint8_t r = (src & 0x00FF0000) >> 16; - uint8_t g = (src & 0x0000FF00) >> 8; - uint8_t b = (src & 0x000000FF) >> 0; - *pixel = (a << 24) | (b << 16) | (g << 8) | (r << 0); - } - } - - done_copying = svcGetSystemTick(); - #if 0 if (frame_count > 6000) { @@ -245,26 +224,16 @@ static bool switch_frame(void *data, const void *frame, if (sw->vsync) /* vsync seems to sometimes return before the buffer has actually been dequeued? */ switch_wait_vsync(sw); - post_vsync = svcGetSystemTick(); - r = surface_dequeue_buffer(&sw->surface, &out_buffer); } while(r != RESULT_OK); - pre_swizzle = svcGetSystemTick(); gfx_slow_swizzling_blit(out_buffer, sw->image, 1280, 720, 0, 0); - post_swizzle = svcGetSystemTick(); - + r = surface_queue_buffer(&sw->surface); if (r != RESULT_OK) return false; - copy_ms = (done_copying - begin) / 19200; - swizzle_ms = (post_swizzle - pre_swizzle) / 19200; - vsync_ms = (post_vsync - done_copying) / 19200; - - RARCH_LOG("frame %d benchmark: copy %ld ms, swizzle %ld ms, vsync %ld ms\n", frame_count, copy_ms, swizzle_ms, vsync_ms); - last_frame = svcGetSystemTick(); return true; } @@ -354,7 +323,7 @@ static void switch_set_texture_frame( if (sw->menu_texture.pixels) free(sw->menu_texture.pixels); - sw->menu_texture.pixels = malloc(width * height * 4); + sw->menu_texture.pixels = malloc(width * height * (rgb32 ? 4 : 2)); if (!sw->menu_texture.pixels) { RARCH_ERR("failed to allocate buffer for menu texture\n"); @@ -378,13 +347,13 @@ static void switch_set_texture_frame( sctx->in_width = width; sctx->in_height = height; - sctx->in_stride = width * 4; - sctx->in_fmt = SCALER_FMT_ARGB8888; + sctx->in_stride = width * (rgb32 ? 4 : 2); + sctx->in_fmt = rgb32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_RGB565; sctx->out_width = sw->menu_texture.tgtw; sctx->out_height = sw->menu_texture.tgth; sctx->out_stride = 1280 * 4; - sctx->out_fmt = SCALER_FMT_ARGB8888; + sctx->out_fmt = SCALER_FMT_ABGR8888; sctx->scaler_type = SCALER_TYPE_POINT; @@ -395,12 +364,7 @@ static void switch_set_texture_frame( } } - if (rgb32) - memcpy(sw->menu_texture.pixels, frame, width * height * 4); - else - conv_rgb565_argb8888(sw->menu_texture.pixels, frame, - width, height, - width * sizeof(uint32_t), width * sizeof(uint16_t)); + memcpy(sw->menu_texture.pixels, frame, width * height * (rgb32 ? 4 : 2)); } static void switch_set_texture_enable(void *data, bool enable, bool full_screen) From e934f1106509a2ffb325f170551361971aba7876 Mon Sep 17 00:00:00 2001 From: misson20000 Date: Tue, 6 Mar 2018 20:55:40 -0800 Subject: [PATCH 3/4] NSW: skip frames if we can't acquire a buffer (newer libtransistors use three buffers, making this very unlikely to happen) --- gfx/drivers/switch_gfx.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gfx/drivers/switch_gfx.c b/gfx/drivers/switch_gfx.c index fb0b4f5299..91c063b357 100644 --- a/gfx/drivers/switch_gfx.c +++ b/gfx/drivers/switch_gfx.c @@ -220,17 +220,18 @@ static bool switch_frame(void *data, const void *frame, if (msg && strlen(msg) > 0) RARCH_LOG("message: %s\n", msg); - do { - if (sw->vsync) /* vsync seems to sometimes return before the buffer has actually been dequeued? */ - switch_wait_vsync(sw); - - r = surface_dequeue_buffer(&sw->surface, &out_buffer); - } while(r != RESULT_OK); - + r = surface_dequeue_buffer(&sw->surface, &out_buffer); + if (sw->vsync) + switch_wait_vsync(sw); + svcSleepThread(10000); + if(r != RESULT_OK) { + return true; // just skip the frame + } + gfx_slow_swizzling_blit(out_buffer, sw->image, 1280, 720, 0, 0); r = surface_queue_buffer(&sw->surface); - + if (r != RESULT_OK) return false; From 35f796d65dc7816dedf26d50a79a43aff9de7725 Mon Sep 17 00:00:00 2001 From: misson20000 Date: Tue, 6 Mar 2018 21:18:42 -0800 Subject: [PATCH 4/4] NSW(audio): use alloc_pages to acquire buffers instead of using ones in bss --- audio/drivers/switch_audio.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/audio/drivers/switch_audio.c b/audio/drivers/switch_audio.c index 063c5a70c8..3a06cc7398 100644 --- a/audio/drivers/switch_audio.c +++ b/audio/drivers/switch_audio.c @@ -18,6 +18,7 @@ #include #include +#include #include "../audio_driver.h" #include "../../verbosity.h" @@ -27,12 +28,6 @@ static const int max_num_samples = sample_rate; static const int num_channels = 2; static const size_t sample_buffer_size = ((max_num_samples * num_channels * sizeof(uint16_t)) + 0xfff) & ~0xfff; -/* don't think this can be in mapped memory, since samples get DMA'd out of it */ -static uint16_t __attribute__((aligned(0x1000))) sample_buffer_1[sample_buffer_size/sizeof(uint16_t)]; -static uint16_t __attribute__((aligned(0x1000))) sample_buffer_2[sample_buffer_size/sizeof(uint16_t)]; -static uint16_t __attribute__((aligned(0x1000))) sample_buffer_3[sample_buffer_size/sizeof(uint16_t)]; -static uint16_t *sample_buffers[3] = {sample_buffer_1, sample_buffer_2, sample_buffer_3}; - typedef struct { audio_output_t output; @@ -254,11 +249,14 @@ static void *switch_audio_init(const char *device, for(i = 0; i < 3; i++) { swa->buffers[i].ptr = &swa->buffers[i].sample_data; - swa->buffers[i].sample_data = sample_buffers[i]; + swa->buffers[i].sample_data = alloc_pages(sample_buffer_size, sample_buffer_size, NULL); swa->buffers[i].buffer_size = sample_buffer_size; swa->buffers[i].data_size = sample_buffer_size; swa->buffers[i].unknown = 0; + if(swa->buffers[i].sample_data == NULL) + goto fail_audio_output; + if (audio_ipc_output_append_buffer(&swa->output, &swa->buffers[i]) != RESULT_OK) goto fail_audio_output; }