diff --git a/Makefile b/Makefile
index a0c1956594..bc63cbc31e 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,11 @@ OBJ = retroarch.o \
 		audio/null.o \
 		gfx/null.o \
 		input/null.o \
-		screenshot.o
+		screenshot.o \
+		gfx/scaler/scaler.o \
+		gfx/scaler/pixconv.o \
+		gfx/scaler/scaler_int.o \
+		gfx/scaler/filter.o
 
 JOYCONFIG_OBJ = tools/retroarch-joyconfig.o \
 	conf/config_file.o \
@@ -142,14 +146,6 @@ ifeq ($(PERF_TEST), 1)
    OBJ += benchmark.o
 endif
 
-ifeq ($(HAVE_SDL), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-else ifeq ($(HAVE_OPENGL), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-else ifeq ($(HAVE_FFMPEG), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-endif
-
 ifeq ($(HAVE_SDL), 1)
    OBJ += gfx/sdl_gfx.o input/sdl_input.o input/sdl_joypad.o audio/sdl_audio.o
    JOYCONFIG_OBJ += input/sdl_joypad.o
diff --git a/Makefile.win b/Makefile.win
index fd01396af5..f7b675cbad 100644
--- a/Makefile.win
+++ b/Makefile.win
@@ -22,7 +22,11 @@ OBJ = retroarch.o \
 		input/null.o \
 		fifo_buffer.o \
 		gfx/null.o \
-		media/rarch.o
+		media/rarch.o \
+		gfx/scaler/scaler.o \
+		gfx/scaler/pixconv.o \
+		gfx/scaler/scaler_int.o \
+		gfx/scaler/filter.o
 
 JOBJ := conf/config_file.o \
 	tools/retroarch-joyconfig.o \
@@ -75,14 +79,6 @@ ifeq ($(PERF_TEST), 1)
    OBJ += benchmark.o
 endif
 
-ifeq ($(HAVE_SDL), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-else ifeq ($(HAVE_OPENGL), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-else ifeq ($(HAVE_FFMPEG), 1)
-   OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
-endif
-
 JLIBS =
 
 ifeq ($(HAVE_SDL), 1)
diff --git a/driver.c b/driver.c
index 000fa75350..a4fafb86c7 100644
--- a/driver.c
+++ b/driver.c
@@ -430,9 +430,9 @@ static void init_filter(void)
    if (*g_settings.video.filter_path == '\0')
       return;
 
-   if (g_extern.system.rgb32)
+   if (g_extern.system.pix_fmt != RETRO_PIXEL_FORMAT_0RGB1555)
    {
-      RARCH_WARN("libretro implementation uses XRGB8888 format. CPU filters only support 0RGB1555.\n");
+      RARCH_WARN("CPU filters only support 0RGB1555.\n");
       return;
    }
 
@@ -536,6 +536,34 @@ static void init_shader_dir(void)
 }
 #endif
 
+// Prepares driver.scaler / driver.scaler_out for converting 0RGB1555 frames to RGB565.
+// Other pixel formats need no converter and succeed immediately. Returns false when the
+// scaler filter cannot be generated or the size x size output buffer cannot be allocated.
+static bool init_video_pixel_converter(unsigned size)
+{
+   if (g_extern.system.pix_fmt != RETRO_PIXEL_FORMAT_0RGB1555)
+      return true;
+
+   RARCH_WARN("0RGB1555 pixel format is deprecated, and will be slower. For 15/16-bit, RGB565 format is preferred.\n");
+
+   // We'll tweak these values later,
+   // just set most of them to something sane to begin with.
+   driver.scaler.in_width    = driver.scaler.in_height  = size;
+   driver.scaler.out_width   = driver.scaler.out_height = size;
+   driver.scaler.scaler_type = SCALER_TYPE_POINT;
+   driver.scaler.in_fmt      = SCALER_FMT_0RGB1555;
+
+   // TODO: Pick either ARGB8888 or RGB565 depending on driver ...
+   driver.scaler.out_fmt     = SCALER_FMT_RGB565;
+
+   if (!scaler_ctx_gen_filter(&driver.scaler))
+      return false;
+
+   // One 16-bit pixel per cell; fail here instead of handing a NULL buffer to later code.
+   driver.scaler_out = calloc((size_t)size * size, sizeof(uint16_t));
+   return driver.scaler_out != NULL;
+}
+
 void init_video_input(void)
 {
 #ifdef HAVE_DYLIB
@@ -585,12 +613,21 @@ void init_video_input(void)
       }
    }
 
-   RARCH_LOG("Video @ %ux%u\n", width, height);
+   if (width && height)
+      RARCH_LOG("Video @ %ux%u\n", width, height);
+   else
+      RARCH_LOG("Video @ fullscreen\n");
 
    driver.display_type  = RARCH_DISPLAY_NONE;
    driver.video_display = 0;
    driver.video_window  = 0;
 
+   if (!init_video_pixel_converter(RARCH_SCALE_BASE * scale))
+   {
+      RARCH_ERR("Failed to init pixel converter.\n");
+      rarch_fail(1, "init_video_input()");
+   }
+
    video_info_t video = {0};
    video.width = width;
    video.height = height;
@@ -599,7 +636,7 @@ void init_video_input(void)
    video.force_aspect = g_settings.video.force_aspect;
    video.smooth = g_settings.video.smooth;
    video.input_scale = scale;
-   video.rgb32 = g_extern.filter.active || g_extern.system.rgb32;
+   video.rgb32 = g_extern.filter.active || (g_extern.system.pix_fmt == RETRO_PIXEL_FORMAT_XRGB8888);
 
    const input_driver_t *tmp = driver.input;
    driver.video_data = video_init_func(&video, &driver.input, &driver.input_data);
@@ -643,6 +680,14 @@ void init_video_input(void)
    }
 }
 
+static void deinit_pixel_converter(void) // Undoes init_video_pixel_converter().
+{
+   free(driver.scaler_out); // Harmless when no conversion buffer was allocated (NULL).
+   driver.scaler_out = NULL;
+   scaler_ctx_gen_reset(&driver.scaler);
+   memset(&driver.scaler, 0, sizeof driver.scaler); // Leave the context zeroed.
+}
+
 void uninit_video_input(void)
 {
    if (driver.input_data != driver.video_data && driver.input)
@@ -651,6 +696,8 @@ void uninit_video_input(void)
    if (driver.video_data && driver.video)
       video_free_func();
 
+   deinit_pixel_converter();
+
 #ifdef HAVE_DYLIB
    deinit_filter();
 #endif
diff --git a/driver.h b/driver.h
index 3061950af9..61242ed48d 100644
--- a/driver.h
+++ b/driver.h
@@ -23,6 +23,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include "msvc/msvc_compat.h"
+#include "gfx/scaler/scaler.h"
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -240,6 +241,9 @@ typedef struct driver
    uintptr_t video_display;
    uintptr_t video_window;
    enum rarch_display_type display_type;
+
+   struct scaler_ctx scaler;
+   void *scaler_out;
 } driver_t;
 
 void init_drivers(void);
diff --git a/dynamic.c b/dynamic.c
index c8f8016a7d..746e622924 100644
--- a/dynamic.c
+++ b/dynamic.c
@@ -420,17 +420,18 @@ static bool environment_cb(unsigned cmd, void *data)
       case RETRO_ENVIRONMENT_SET_PIXEL_FORMAT:
       {
          enum retro_pixel_format pix_fmt = *(const enum retro_pixel_format*)data;
-         bool rgb32 = false;
          switch (pix_fmt)
          {
             case RETRO_PIXEL_FORMAT_0RGB1555:
-               rgb32 = false;
                RARCH_LOG("Environ SET_PIXEL_FORMAT: 0RGB1555.\n");
                break;
 
+            case RETRO_PIXEL_FORMAT_RGB565:
+               RARCH_LOG("Environ SET_PIXEL_FORMAT: RGB565.\n");
+               break;
+
 #ifndef RARCH_CONSOLE
             case RETRO_PIXEL_FORMAT_XRGB8888:
-               rgb32 = true;
                RARCH_LOG("Environ SET_PIXEL_FORMAT: XRGB8888.\n");
                break;
 #endif
@@ -438,7 +439,7 @@ static bool environment_cb(unsigned cmd, void *data)
                return false;
          }
          
-         g_extern.system.rgb32 = rgb32;
+         g_extern.system.pix_fmt = pix_fmt;
          break;
       }
 
diff --git a/general.h b/general.h
index 66ab6775d5..822e70acba 100644
--- a/general.h
+++ b/general.h
@@ -302,7 +302,7 @@ struct global
       unsigned rotation;
       bool shutdown;
       unsigned performance_level;
-      bool rgb32;
+      enum retro_pixel_format pix_fmt;
 
       bool force_nonblock;
 
diff --git a/gfx/context/vc_egl_ctx.c b/gfx/context/vc_egl_ctx.c
index 8aea660a07..675fc0c4ca 100644
--- a/gfx/context/vc_egl_ctx.c
+++ b/gfx/context/vc_egl_ctx.c
@@ -443,12 +443,12 @@ static bool gfx_ctx_write_egl_image(const void *frame, unsigned width, unsigned
 
    if (!eglBuffer[index] || !g_egl_vgimage[index])
    {
-      g_egl_vgimage[index] = vgCreateImage(VG_sXRGB_8888, g_egl_res, g_egl_res, g_smooth ? VG_IMAGE_QUALITY_BETTER : VG_IMAGE_QUALITY_NONANTIALIASED);
+      g_egl_vgimage[index] = vgCreateImage(rgb32 ? VG_sXRGB_8888 : VG_sRGB_565, g_egl_res, g_egl_res, VG_IMAGE_QUALITY_NONANTIALIASED);
       eglBuffer[index] = peglCreateImageKHR(g_egl_dpy, g_eglimage_ctx, EGL_VG_PARENT_IMAGE_KHR, (EGLClientBuffer)g_egl_vgimage[index], NULL);
       ret = true;
    }
 
-   vgImageSubData(g_egl_vgimage[index], frame, pitch, (rgb32 ? VG_sXRGB_8888 : VG_sARGB_1555), 0, 0, width, height);
+   vgImageSubData(g_egl_vgimage[index], frame, pitch, (rgb32 ? VG_sXRGB_8888 : VG_sRGB_565), 0, 0, width, height);
    *image_handle = eglBuffer[index];
 
    gfx_ctx_bind_api(g_api);
diff --git a/gfx/ext/rarch_video.h b/gfx/ext/rarch_video.h
index 04b8e057c5..456110b5b8 100644
--- a/gfx/ext/rarch_video.h
+++ b/gfx/ext/rarch_video.h
@@ -24,7 +24,7 @@ extern "C" {
 #define RARCH_API_CALLTYPE
 #endif
 
-#define RARCH_GRAPHICS_API_VERSION 4
+#define RARCH_GRAPHICS_API_VERSION 5
 
 // Since we don't want to rely on C++ or C99 for a proper boolean type,
 // make sure return semantics are perfectly clear ... ;)
@@ -45,8 +45,8 @@ extern "C" {
 #define RARCH_FALSE 0
 #endif
 
-#define RARCH_COLOR_FORMAT_XRGB1555 0
-#define RARCH_COLOR_FORMAT_ARGB8888 1
+#define RARCH_COLOR_FORMAT_RGB565 0
+#define RARCH_COLOR_FORMAT_XRGB8888 1
 
 #define RARCH_INPUT_SCALE_BASE 256
 
diff --git a/gfx/ext_gfx.c b/gfx/ext_gfx.c
index 1c4604bb1b..0d85052a6b 100644
--- a/gfx/ext_gfx.c
+++ b/gfx/ext_gfx.c
@@ -300,7 +300,7 @@ static bool setup_video(ext_t *ext, const video_info_t *video, const input_drive
    info.aspect_ratio       = g_settings.video.aspect_ratio;
    info.smooth             = video->smooth;
    info.input_scale        = video->input_scale;
-   info.color_format       = video->rgb32 ? RARCH_COLOR_FORMAT_ARGB8888 : RARCH_COLOR_FORMAT_XRGB1555;
+   info.color_format       = video->rgb32 ? RARCH_COLOR_FORMAT_XRGB8888 : RARCH_COLOR_FORMAT_RGB565;
    info.xml_shader         = xml_shader;
    info.cg_shader          = cg_shader;
    info.ttf_font           = font;
diff --git a/gfx/gl.c b/gfx/gl.c
index 8cf2caf465..9c2853688c 100644
--- a/gfx/gl.c
+++ b/gfx/gl.c
@@ -94,7 +94,7 @@ const GLfloat *default_vertex_ptr = vertexes_flipped;
    memcpy(&(pgl##SYM), &sym, sizeof(sym)); \
 }
 
-#ifdef HAVE_EGL
+#if defined(HAVE_EGL) && defined(HAVE_OPENGLES2)
 static PFNGLEGLIMAGETARGETTEXTURE2DOESPROC pglEGLImageTargetTexture2DOES;
 
 static bool load_eglimage_proc(gl_t *gl)
@@ -470,8 +470,8 @@ static void gl_create_fbo_textures(gl_t *gl)
       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter_type);
 
       glTexImage2D(GL_TEXTURE_2D,
-            0, RARCH_GL_INTERNAL_FORMAT, gl->fbo_rect[i].width, gl->fbo_rect[i].height,
-            0, RARCH_GL_TEXTURE_TYPE,
+            0, RARCH_GL_INTERNAL_FORMAT32, gl->fbo_rect[i].width, gl->fbo_rect[i].height,
+            0, RARCH_GL_TEXTURE_TYPE32,
             RARCH_GL_FORMAT32, NULL);
    }
 
@@ -720,8 +720,8 @@ static void gl_check_fbo_dimensions(gl_t *gl)
          glBindTexture(GL_TEXTURE_2D, gl->fbo_texture[i]);
 
          glTexImage2D(GL_TEXTURE_2D,
-               0, RARCH_GL_INTERNAL_FORMAT, gl->fbo_rect[i].width, gl->fbo_rect[i].height,
-               0, RARCH_GL_TEXTURE_TYPE,
+               0, RARCH_GL_INTERNAL_FORMAT32, gl->fbo_rect[i].width, gl->fbo_rect[i].height,
+               0, RARCH_GL_TEXTURE_TYPE32,
                RARCH_GL_FORMAT32, NULL);
 
          pglFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, gl->fbo_texture[i], 0);
@@ -872,8 +872,8 @@ static void gl_update_input_size(gl_t *gl, unsigned width, unsigned height, unsi
 }
 
 // It is *much* faster (order of mangnitude on my setup) to use a custom SIMD-optimized conversion routine than letting GL do it :(
-#if !defined(HAVE_PSGL)
-static inline void gl_convert_frame_rgb15_32(gl_t *gl, void *output, const void *input, int width, int height, int in_pitch)
+#if !defined(HAVE_PSGL) && !defined(HAVE_OPENGLES2)
+static inline void gl_convert_frame_rgb16_32(gl_t *gl, void *output, const void *input, int width, int height, int in_pitch)
 {
    if (width != gl->scaler.in_width || height != gl->scaler.in_height)
    {
@@ -881,7 +881,7 @@ static inline void gl_convert_frame_rgb15_32(gl_t *gl, void *output, const void
       gl->scaler.in_height   = height;
       gl->scaler.out_width   = width;
       gl->scaler.out_height  = height;
-      gl->scaler.in_fmt      = SCALER_FMT_0RGB1555;
+      gl->scaler.in_fmt      = SCALER_FMT_RGB565;
       gl->scaler.out_fmt     = SCALER_FMT_ARGB8888;
       gl->scaler.scaler_type = SCALER_TYPE_POINT;
       scaler_ctx_gen_filter(&gl->scaler);
@@ -935,6 +935,7 @@ static void gl_init_textures(gl_t *gl)
 #else
 static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch)
 {
+#ifdef HAVE_OPENGLES2
 #ifdef HAVE_EGL
    if (gl->egl_images)
    {
@@ -952,17 +953,7 @@ static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, un
    }
    else
 #endif
-   if (gl->base_size == 2) // ARGB1555 => ARGB8888, SIMD-style :D
    {
-      glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * sizeof(uint32_t))); // Always use 32-bit textures.
-      gl_convert_frame_rgb15_32(gl, gl->conv_buffer, frame, width, height, pitch);
-      glTexSubImage2D(GL_TEXTURE_2D,
-            0, 0, 0, width, height, gl->texture_type,
-            gl->texture_fmt, gl->conv_buffer);
-   }
-   else
-   {
-#ifdef HAVE_OPENGLES2
       // No GL_UNPACK_ROW_LENGTH ;(
       unsigned pitch_width = pitch / gl->base_size;
       if (width == pitch_width) // Happy path :D
@@ -971,31 +962,47 @@ static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, un
                0, 0, 0, width, height, gl->texture_type,
                gl->texture_fmt, frame);
       }
-      else // Probably slower path.
+      else // Slower path.
       {
-         const uint32_t *src = (const uint32_t*)frame;
-         for (unsigned h = 0; h < height; h++, src += pitch_width)
+         const uint8_t *src = (const uint8_t*)frame;
+         for (unsigned h = 0; h < height; h++, src += pitch)
          {
             glTexSubImage2D(GL_TEXTURE_2D,
                   0, 0, h, width, 1, gl->texture_type,
                   gl->texture_fmt, src);
          }
       }
+   }
 #else
-      glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(pitch));
+   glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(pitch));
+   if (gl->base_size == 2)
+   {
+      // Always use 32-bit textures on desktop GL.
+      gl_convert_frame_rgb16_32(gl, gl->conv_buffer, frame, width, height, pitch);
+      glTexSubImage2D(GL_TEXTURE_2D,
+            0, 0, 0, width, height, gl->texture_type,
+            gl->texture_fmt, gl->conv_buffer);
+   }
+   else
+   {
       glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch / gl->base_size);
-
       glTexSubImage2D(GL_TEXTURE_2D,
             0, 0, 0, width, height, gl->texture_type,
             gl->texture_fmt, frame);
 
       glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-#endif
    }
+#endif
 }
 
-static void gl_init_textures(gl_t *gl)
+static void gl_init_textures(gl_t *gl, const video_info_t *video)
 {
+#if defined(HAVE_EGL) && defined(HAVE_OPENGLES2)
+   gl->egl_images = load_eglimage_proc(gl) && gl->ctx_driver->init_egl_image_buffer(video);
+#else
+   (void)video;
+#endif
+
    glGenTextures(TEXTURES, gl->texture);
    for (unsigned i = 0; i < TEXTURES; i++)
    {
@@ -1006,9 +1013,12 @@ static void gl_init_textures(gl_t *gl)
       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, gl->tex_filter);
       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, gl->tex_filter);
 
-      glTexImage2D(GL_TEXTURE_2D,
-            0, RARCH_GL_INTERNAL_FORMAT, gl->tex_w, gl->tex_h, 0, gl->texture_type,
-            gl->texture_fmt, gl->empty_buf ? gl->empty_buf : NULL);
+      if (!gl->egl_images)
+      {
+         glTexImage2D(GL_TEXTURE_2D,
+               0, gl->internal_fmt, gl->tex_w, gl->tex_h, 0, gl->texture_type,
+               gl->texture_fmt, gl->empty_buf ? gl->empty_buf : NULL);
+      }
    }
    glBindTexture(GL_TEXTURE_2D, gl->texture[gl->tex_index]);
 }
@@ -1338,7 +1348,8 @@ static void *gl_init(const video_info_t *video, const input_driver_t **input, vo
    else
       gl->tex_filter = video->smooth ? GL_LINEAR : GL_NEAREST;
 
-   gl->texture_type = RARCH_GL_TEXTURE_TYPE;
+   gl->internal_fmt = video->rgb32 ? RARCH_GL_INTERNAL_FORMAT32 : RARCH_GL_INTERNAL_FORMAT16;
+   gl->texture_type = video->rgb32 ? RARCH_GL_TEXTURE_TYPE32 : RARCH_GL_TEXTURE_TYPE16;
    gl->texture_fmt  = video->rgb32 ? RARCH_GL_FORMAT32 : RARCH_GL_FORMAT16;
    gl->base_size    = video->rgb32 ? sizeof(uint32_t) : sizeof(uint16_t);
 
@@ -1379,7 +1390,7 @@ static void *gl_init(const video_info_t *video, const input_driver_t **input, vo
    }
 #endif
 
-   gl_init_textures(gl);
+   gl_init_textures(gl, video);
 
    for (unsigned i = 0; i < TEXTURES; i++)
    {
@@ -1407,10 +1418,6 @@ static void *gl_init(const video_info_t *video, const input_driver_t **input, vo
       return NULL;
    }
 
-#ifdef HAVE_EGL
-   gl->egl_images = load_eglimage_proc(gl) && gl->ctx_driver->init_egl_image_buffer(video);
-#endif
-
    return gl;
 }
 
diff --git a/gfx/gl_common.h b/gfx/gl_common.h
index fb6c6beb29..220e722a72 100644
--- a/gfx/gl_common.h
+++ b/gfx/gl_common.h
@@ -158,7 +158,7 @@ struct gl_coords
 
 #define MAX_SHADERS 16
 
-#if defined(HAVE_GLSL) || defined(HAVE_CG)
+#if (defined(HAVE_GLSL) || defined(HAVE_CG))
 #define TEXTURES 8
 #else
 #define TEXTURES 1
@@ -213,7 +213,8 @@ typedef struct gl
    struct gl_coords coords;
 
    GLuint pbo;
-   GLenum texture_type; // XBGR1555 or ARGB
+   GLenum internal_fmt;
+   GLenum texture_type; // RGB565 or ARGB
    GLenum texture_fmt;
    GLenum border_type;
    unsigned base_size; // 2 or 4
@@ -237,9 +238,7 @@ typedef struct gl
    GLuint menu_texture_id;
 #endif
 
-#ifdef HAVE_EGL
    bool egl_images;
-#endif
 } gl_t;
 
 // Windows ... <_<
@@ -252,19 +251,25 @@ extern PFNGLACTIVETEXTUREPROC pglActiveTexture;
 #endif
 
 #if defined(HAVE_PSGL)
-#define RARCH_GL_INTERNAL_FORMAT GL_ARGB_SCE
-#define RARCH_GL_TEXTURE_TYPE GL_BGRA
+#define RARCH_GL_INTERNAL_FORMAT32 GL_ARGB_SCE
+#define RARCH_GL_INTERNAL_FORMAT16 GL_ARGB_SCE
+#define RARCH_GL_TEXTURE_TYPE32 GL_BGRA
+#define RARCH_GL_TEXTURE_TYPE16 GL_BGRA
 #define RARCH_GL_FORMAT32 GL_UNSIGNED_INT_8_8_8_8_REV
 #define RARCH_GL_FORMAT16 GL_RGB5_A1
 #elif defined(HAVE_OPENGLES)
-#define RARCH_GL_INTERNAL_FORMAT GL_BGRA_EXT
-#define RARCH_GL_TEXTURE_TYPE GL_BGRA_EXT
+#define RARCH_GL_INTERNAL_FORMAT32 GL_BGRA_EXT
+#define RARCH_GL_INTERNAL_FORMAT16 GL_RGB
+#define RARCH_GL_TEXTURE_TYPE32 GL_BGRA_EXT
+#define RARCH_GL_TEXTURE_TYPE16 GL_RGB
 #define RARCH_GL_FORMAT32 GL_UNSIGNED_BYTE
-// 15-bit is converted to 32-bit directly as we have to convert anyways.
-#define RARCH_GL_FORMAT16 GL_UNSIGNED_BYTE
+#define RARCH_GL_FORMAT16 GL_UNSIGNED_SHORT_5_6_5
 #else
-#define RARCH_GL_INTERNAL_FORMAT GL_RGBA
-#define RARCH_GL_TEXTURE_TYPE GL_BGRA
+// On desktop, we always use 32-bit.
+#define RARCH_GL_INTERNAL_FORMAT32 GL_RGBA
+#define RARCH_GL_INTERNAL_FORMAT16 GL_RGBA
+#define RARCH_GL_TEXTURE_TYPE32 GL_BGRA
+#define RARCH_GL_TEXTURE_TYPE16 GL_BGRA
 #define RARCH_GL_FORMAT32 GL_UNSIGNED_INT_8_8_8_8_REV
 #define RARCH_GL_FORMAT16 GL_UNSIGNED_INT_8_8_8_8_REV
 #endif
diff --git a/gfx/scaler/pixconv.c b/gfx/scaler/pixconv.c
index 33cfc1f00b..5318b9ef80 100644
--- a/gfx/scaler/pixconv.c
+++ b/gfx/scaler/pixconv.c
@@ -27,6 +27,64 @@
 #include <emmintrin.h>
 #endif
 
+#if defined(__SSE2__)
+void conv_0rgb1555_rgb565(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *input = (const uint16_t*)input_;
+   uint16_t *output = (uint16_t*)output_;
+   // SSE2 0RGB1555 -> RGB565 conversion: 8 pixels per vector step, then a scalar tail.
+   int max_width = width - 7; // Vector loop only runs while 8 whole pixels remain.
+
+   const __m128i hi_mask   = _mm_set1_epi16((int16_t)((0x1f << 11) | (0x1f << 6))); // Output bits 15-6.
+   const __m128i lo_mask   = _mm_set1_epi16(0x1f); // Blue, bits 4-0 in both formats.
+   const __m128i glow_mask = _mm_set1_epi16(1 << 5); // Lowest green bit of the output.
+
+   for (int h = 0; h < height; h++, output += out_stride >> 1, input += in_stride >> 1) // Strides are in bytes.
+   {
+      int w;
+      for (w = 0; w < max_width; w += 8)
+      {
+         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
+         __m128i rg   = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask); // Move red and green up by one bit.
+         __m128i b    = _mm_and_si128(in, lo_mask);
+         __m128i glow = _mm_and_si128(_mm_srli_epi16(in, 4), glow_mask); // Source bit 9 (green MSB) -> bit 5.
+         _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(rg, _mm_or_si128(b, glow)));
+      }
+
+      for (; w < width; w++) // Scalar tail: same per-pixel math for the pixels left over.
+      {
+         uint16_t col = input[w];
+         uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6));
+         uint16_t b = col & 0x1f;
+         uint16_t glow = (col >> 4) & (1 << 5);
+         output[w] = rg | b | glow;
+      }
+   }
+}
+#else
+// Scalar 0RGB1555 -> RGB565 conversion; both strides are given in bytes.
+void conv_0rgb1555_rgb565(void *output_, const void *input_,
+      int width, int height, int out_stride, int in_stride)
+{
+   const uint16_t *src_row = (const uint16_t*)input_;
+   uint16_t *dst_row       = (uint16_t*)output_;
+   for (int y = 0; y < height; y++)
+   {
+      for (int x = 0; x < width; x++)
+      {
+         unsigned pix       = src_row[x];
+         unsigned red_green = (pix << 1) & 0xffc0u; // R -> bits 15-11, G -> bits 10-6.
+         unsigned green_lsb = (pix >> 4) & 0x0020u; // G's top bit doubles as its new LSB.
+         dst_row[x] = (uint16_t)(red_green | green_lsb | (pix & 0x1fu));
+      }
+      dst_row += out_stride >> 1;
+      src_row += in_stride >> 1;
+   }
+}
+#endif
+
 #if defined(__SSE2__)
 void conv_0rgb1555_argb8888(void *output_, const void *input_,
       int width, int height,
@@ -109,6 +167,90 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
 }
 #endif
 
+#if defined(__SSE2__)
+void conv_rgb565_argb8888(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *input = (const uint16_t*)input_;
+   uint32_t *output      = (uint32_t*)output_;
+   // SSE2 RGB565 -> ARGB8888 (alpha 0xff): 8 pixels per vector step, then a scalar tail.
+   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10); // Red after shifting right by one.
+   const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5); // Green in its original position.
+   const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5); // Blue after shifting left by five.
+   const __m128i mul16_r    = _mm_set1_epi16(0x0210); // mulhi factors: expand each channel to 8 bits.
+   const __m128i mul16_g    = _mm_set1_epi16(0x2080);
+   const __m128i mul16_b    = _mm_set1_epi16(0x4200);
+   const __m128i a          = _mm_set1_epi16(0x00ff);
+
+   int max_width = width - 7;
+
+   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride >> 1)
+   {
+      int w;
+      for (w = 0; w < max_width; w += 8)
+      {
+         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
+         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
+         __m128i g = _mm_and_si128(in, pix_mask_g);
+         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
+
+         r = _mm_mulhi_epi16(r, mul16_r);
+         g = _mm_mulhi_epi16(g, mul16_g);
+         b = _mm_mulhi_epi16(b, mul16_b);
+
+         __m128i res_lo_bg = _mm_unpacklo_epi8(b, g);
+         __m128i res_hi_bg = _mm_unpackhi_epi8(b, g);
+         __m128i res_lo_ra = _mm_unpacklo_epi8(r, a);
+         __m128i res_hi_ra = _mm_unpackhi_epi8(r, a);
+
+         __m128i res_lo = _mm_or_si128(res_lo_bg, _mm_slli_si128(res_lo_ra, 2)); // Bytes B,G,R,A per pixel.
+         __m128i res_hi = _mm_or_si128(res_hi_bg, _mm_slli_si128(res_hi_ra, 2));
+
+         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
+         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
+      }
+
+      for (; w < width; w++) // Scalar tail for the pixels the vector loop did not cover.
+      {
+         uint32_t col = input[w];
+         uint32_t r = (col >> 11) & 0x1f;
+         uint32_t g = (col >>  5) & 0x3f;
+         uint32_t b = (col >>  0) & 0x1f;
+         r = (r << 3) | (r >> 2);
+         g = (g << 2) | (g >> 4);
+         b = (b << 3) | (b >> 2);
+         // Unsigned alpha constant: shifting the int 0xff left by 24 would overflow a signed int.
+         output[w] = ((uint32_t)0xff << 24) | (r << 16) | (g << 8) | (b << 0);
+      }
+   }
+}
+#else
+// Scalar RGB565 -> ARGB8888 conversion with opaque alpha; strides are in bytes.
+void conv_rgb565_argb8888(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *input = (const uint16_t*)input_;
+   uint32_t *output      = (uint32_t*)output_;
+   for (int h = 0; h < height; h++, output += out_stride >> 2, input += in_stride >> 1)
+   {
+      for (int w = 0; w < width; w++)
+      {
+         uint32_t col = input[w];
+         uint32_t r = (col >> 11) & 0x1f;
+         uint32_t g = (col >>  5) & 0x3f;
+         uint32_t b = (col >>  0) & 0x1f;
+         r = (r << 3) | (r >> 2); // Replicate top bits so 0x1f expands to 0xff.
+         g = (g << 2) | (g >> 4); // 6-bit green: 0x3f expands to 0xff.
+         b = (b << 3) | (b >> 2);
+         // Unsigned alpha constant: shifting the int 0xff left by 24 would overflow a signed int.
+         output[w] = ((uint32_t)0xff << 24) | (r << 16) | (g << 8) | (b << 0);
+      }
+   }
+}
+#endif
+
 #if defined(__SSE2__)
 // :( TODO: Make this saner.
 static inline void store_bgr24_sse2(void *output, __m128i a, __m128i b, __m128i c, __m128i d)
@@ -223,6 +365,80 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
       }
    }
 }
+
+void conv_rgb565_bgr24(void *output_, const void *input_,
+      int width, int height,
+      int out_stride, int in_stride)
+{
+   const uint16_t *input = (const uint16_t*)input_;
+   uint8_t *output      = (uint8_t*)output_;
+   // SSE2 RGB565 -> 3-byte B,G,R conversion: 16 pixels per vector step, then a scalar tail.
+   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10); // Red after shifting right by one.
+   const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5); // Green in its original position.
+   const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5); // Blue after shifting left by five.
+   const __m128i mul16_r    = _mm_set1_epi16(0x0210); // mulhi factors: expand each channel to 8 bits.
+   const __m128i mul16_g    = _mm_set1_epi16(0x2080);
+   const __m128i mul16_b    = _mm_set1_epi16(0x4200);
+   const __m128i a          = _mm_set1_epi16(0x00ff);
+
+   int max_width = width - 15; // Vector loop only runs while 16 whole pixels remain.
+
+   for (int h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
+   {
+      uint8_t *out = output;
+
+      int w;
+      for (w = 0; w < max_width; w += 16, out += 48) // 16 pixels x 3 output bytes.
+      {
+         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w));
+         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
+         __m128i r0 = _mm_and_si128(_mm_srli_epi16(in0, 1), pix_mask_r);
+         __m128i g0 = _mm_and_si128(in0, pix_mask_g);
+         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_b);
+         __m128i r1 = _mm_and_si128(_mm_srli_epi16(in1, 1), pix_mask_r);
+         __m128i g1 = _mm_and_si128(in1, pix_mask_g);
+         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);
+
+         r0 = _mm_mulhi_epi16(r0, mul16_r);
+         g0 = _mm_mulhi_epi16(g0, mul16_g);
+         b0 = _mm_mulhi_epi16(b0, mul16_b);
+         r1 = _mm_mulhi_epi16(r1, mul16_r);
+         g1 = _mm_mulhi_epi16(g1, mul16_g);
+         b1 = _mm_mulhi_epi16(b1, mul16_b);
+
+         __m128i res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
+         __m128i res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
+         __m128i res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
+         __m128i res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
+         __m128i res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
+         __m128i res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
+         __m128i res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
+         __m128i res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
+
+         __m128i res_lo0 = _mm_or_si128(res_lo_bg0, _mm_slli_si128(res_lo_ra0, 2)); // Bytes B,G,R,A per pixel.
+         __m128i res_hi0 = _mm_or_si128(res_hi_bg0, _mm_slli_si128(res_hi_ra0, 2));
+         __m128i res_lo1 = _mm_or_si128(res_lo_bg1, _mm_slli_si128(res_lo_ra1, 2));
+         __m128i res_hi1 = _mm_or_si128(res_hi_bg1, _mm_slli_si128(res_hi_ra1, 2));
+
+         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1); // NOTE(review): presumably drops the alpha bytes - confirm in store_bgr24_sse2.
+      }
+
+      for (; w < width; w++) // Scalar tail; continues writing at the position the vector loop reached.
+      {
+         uint32_t col = input[w];
+         uint32_t r = (col >> 11) & 0x1f;
+         uint32_t g = (col >>  5) & 0x3f;
+         uint32_t b = (col >>  0) & 0x1f;
+         r = (r << 3) | (r >> 2);
+         g = (g << 2) | (g >> 4);
+         b = (b << 3) | (b >> 2);
+
+         *out++ = b;
+         *out++ = g;
+         *out++ = r;
+      }
+   }
+}
 #else
 void conv_0rgb1555_bgr24(void *output_, const void *input_,
       int width, int height,
@@ -250,6 +466,33 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
       }
    }
 }
+
+// Scalar RGB565 -> BGR24 conversion. out_stride and in_stride are byte strides;
+// each destination pixel is written as three bytes in B, G, R order.
+void conv_rgb565_bgr24(void *output_, const void *input_,
+      int width, int height, int out_stride, int in_stride)
+{
+   const uint16_t *src_row = (const uint16_t*)input_;
+   uint8_t *dst_row        = (uint8_t*)output_;
+
+   for (int y = 0; y < height; y++)
+   {
+      for (int x = 0; x < width; x++)
+      {
+         const uint32_t pix   = src_row[x];
+         const uint32_t red   = (pix >> 11) & 0x1f;
+         const uint32_t green = (pix >>  5) & 0x3f;
+         const uint32_t blue  = pix & 0x1f;
+         uint8_t *dst         = dst_row + 3 * x;
+
+         dst[0] = (uint8_t)((blue  << 3) | (blue  >> 2));
+         dst[1] = (uint8_t)((green << 2) | (green >> 4));
+         dst[2] = (uint8_t)((red   << 3) | (red   >> 2));
+      }
+      dst_row += out_stride;
+      src_row += in_stride >> 1;
+   }
+}
 #endif
 
 void conv_bgr24_argb8888(void *output_, const void *input_,
diff --git a/gfx/scaler/pixconv.h b/gfx/scaler/pixconv.h
index d27608704d..9cdb7d9182 100644
--- a/gfx/scaler/pixconv.h
+++ b/gfx/scaler/pixconv.h
@@ -20,6 +20,14 @@ void conv_0rgb1555_argb8888(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
 
+void conv_0rgb1555_rgb565(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride); // 0RGB1555 -> RGB565; strides are in bytes.
+
+void conv_rgb565_argb8888(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride); // RGB565 -> ARGB8888 with alpha 0xff; strides are in bytes.
+
 void conv_bgr24_argb8888(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
@@ -28,6 +36,10 @@ void conv_argb8888_0rgb1555(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
 
+void conv_argb8888_rgb565(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride); // NOTE(review): confirm pixconv.c actually defines this converter before calling it.
+
 void conv_argb8888_bgr24(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
@@ -36,6 +48,10 @@ void conv_0rgb1555_bgr24(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
 
+void conv_rgb565_bgr24(void *output, const void *input,
+      int width, int height,
+      int out_stride, int in_stride); // RGB565 -> 3 bytes per pixel in B,G,R order; strides are in bytes.
+
 void conv_copy(void *output, const void *input,
       int width, int height,
       int out_stride, int in_stride);
diff --git a/gfx/scaler/scaler.c b/gfx/scaler/scaler.c
index cdc5148593..0ea0f7ffc2 100644
--- a/gfx/scaler/scaler.c
+++ b/gfx/scaler/scaler.c
@@ -68,6 +68,12 @@ static bool set_direct_pix_conv(struct scaler_ctx *ctx)
       ctx->direct_pixconv = conv_copy;
    else if (ctx->in_fmt == SCALER_FMT_0RGB1555 && ctx->out_fmt == SCALER_FMT_ARGB8888)
       ctx->direct_pixconv = conv_0rgb1555_argb8888;
+   else if (ctx->in_fmt == SCALER_FMT_RGB565 && ctx->out_fmt == SCALER_FMT_ARGB8888)
+      ctx->direct_pixconv = conv_rgb565_argb8888;
+   else if (ctx->in_fmt == SCALER_FMT_RGB565 && ctx->out_fmt == SCALER_FMT_BGR24)
+      ctx->direct_pixconv = conv_rgb565_bgr24;
+   else if (ctx->in_fmt == SCALER_FMT_0RGB1555 && ctx->out_fmt == SCALER_FMT_RGB565)
+      ctx->direct_pixconv = conv_0rgb1555_rgb565;
    else if (ctx->in_fmt == SCALER_FMT_BGR24 && ctx->out_fmt == SCALER_FMT_ARGB8888)
       ctx->direct_pixconv = conv_bgr24_argb8888;
    else if (ctx->in_fmt == SCALER_FMT_ARGB8888 && ctx->out_fmt == SCALER_FMT_0RGB1555)
@@ -76,6 +82,8 @@ static bool set_direct_pix_conv(struct scaler_ctx *ctx)
       ctx->direct_pixconv = conv_argb8888_bgr24;
    else if (ctx->in_fmt == SCALER_FMT_0RGB1555 && ctx->out_fmt == SCALER_FMT_BGR24)
       ctx->direct_pixconv = conv_0rgb1555_bgr24;
+   // RGB565 -> BGR24 is already dispatched earlier in this else-if chain,
+   // so a second test for the same format pair here could never be taken.
    else
       return false;
 
@@ -94,6 +102,10 @@ static bool set_pix_conv(struct scaler_ctx *ctx)
          ctx->in_pixconv = conv_0rgb1555_argb8888;
          break;
 
+      case SCALER_FMT_RGB565:
+         ctx->in_pixconv = conv_rgb565_argb8888;
+         break;
+
       case SCALER_FMT_BGR24:
          ctx->in_pixconv = conv_bgr24_argb8888;
          break;
@@ -160,14 +172,6 @@ bool scaler_ctx_gen_filter(struct scaler_ctx *ctx)
 
 void scaler_ctx_gen_reset(struct scaler_ctx *ctx)
 {
-#ifdef SCALER_PERF
-   if (ctx->elapsed_frames)
-      fprintf(stderr, "[Scaler]: ms / frame: %.3f\n", ctx->elapsed_time_ms / ctx->elapsed_frames);
-
-   ctx->elapsed_time_ms = 0.0;
-   ctx->elapsed_frames  = 0;
-#endif
-
    scaler_free(ctx->horiz.filter);
    scaler_free(ctx->horiz.filter_pos);
    scaler_free(ctx->vert.filter);
diff --git a/gfx/scaler/scaler.h b/gfx/scaler/scaler.h
index ffeca48a46..1b6f691e0f 100644
--- a/gfx/scaler/scaler.h
+++ b/gfx/scaler/scaler.h
@@ -26,6 +26,7 @@ enum scaler_pix_fmt
 {
    SCALER_FMT_ARGB8888 = 0,
    SCALER_FMT_0RGB1555,
+   SCALER_FMT_RGB565,
    SCALER_FMT_BGR24
 };
 
diff --git a/gfx/sdl_gfx.c b/gfx/sdl_gfx.c
index 37027751b4..88abe57cc5 100644
--- a/gfx/sdl_gfx.c
+++ b/gfx/sdl_gfx.c
@@ -38,9 +38,9 @@
 #include "SDL/SDL_syswm.h"
 #endif
 
-static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
+static void convert_16bit_16bit_direct(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
 static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
-static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
+static void convert_16bit_16bit_shift(uint16_t *out, unsigned outpitch, const uint16_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
 static void convert_32bit_32bit_shift(uint32_t *out, unsigned outpitch, const uint32_t *input, unsigned width, unsigned height, unsigned pitch, const SDL_PixelFormat *fmt);
 
 typedef struct sdl_video
@@ -52,7 +52,7 @@ typedef struct sdl_video
 
    bool render32;
 
-   void (*convert_15_func)(uint16_t*, unsigned, const uint16_t*, unsigned, unsigned, unsigned, const SDL_PixelFormat*);
+   void (*convert_16_func)(uint16_t*, unsigned, const uint16_t*, unsigned, unsigned, unsigned, const SDL_PixelFormat*);
    void (*convert_32_func)(uint32_t*, unsigned, const uint32_t*, unsigned, unsigned, unsigned, const SDL_PixelFormat*);
 
 #ifdef HAVE_FREETYPE
@@ -111,11 +111,11 @@ static void sdl_init_font(sdl_video_t *vid, const char *font_path, unsigned font
          g = g < 0 ? 0 : (g > 255 ? 255 : g);
          b = b < 0 ? 0 : (b > 255 ? 255 : b);
 
-         // RGB888 -> RGB555
+         // RGB888 -> RGB565
          if (!vid->render32)
          {
             r >>= 3;
-            g >>= 3;
+            g >>= 2;
             b >>= 3;
          }
 
@@ -136,7 +136,7 @@ static void sdl_init_font(sdl_video_t *vid, const char *font_path, unsigned font
 }
 
 // Not very optimized, but hey :D
-static void sdl_render_msg_15(sdl_video_t *vid, SDL_Surface *buffer, const char *msg, unsigned width, unsigned height, const SDL_PixelFormat *fmt)
+static void sdl_render_msg_16(sdl_video_t *vid, SDL_Surface *buffer, const char *msg, unsigned width, unsigned height, const SDL_PixelFormat *fmt)
 {
 #ifdef HAVE_FREETYPE
    if (!vid->font)
@@ -197,7 +197,7 @@ static void sdl_render_msg_15(sdl_video_t *vid, SDL_Surface *buffer, const char
             unsigned blend = src[x];
             unsigned out_pix = out[x];
             unsigned r = (out_pix >> rshift) & 0x1f;
-            unsigned g = (out_pix >> gshift) & 0x1f;
+            unsigned g = (out_pix >> gshift) & 0x3f;
             unsigned b = (out_pix >> bshift) & 0x1f;
 
             unsigned out_r = (r * (256 - blend) + vid->font_r * blend) >> 8;
@@ -353,7 +353,7 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
       RARCH_LOG("Creating window @ %ux%u\n", video->width, video->height);
 
    vid->render32 = !g_settings.video.force_16bit;
-   vid->screen = SDL_SetVideoMode(video->width, video->height, vid->render32 ? 32 : 15, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));
+   vid->screen = SDL_SetVideoMode(video->width, video->height, vid->render32 ? 32 : 16, SDL_HWSURFACE | SDL_HWACCEL | SDL_DOUBLEBUF | (video->fullscreen ? SDL_FULLSCREEN : 0));
 
    if (!vid->screen)
    {
@@ -376,9 +376,9 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
    }
    else
    {
-      RARCH_LOG("SDL: Creating 15-bit buffer.\n");
+      RARCH_LOG("SDL: Creating 16-bit buffer.\n");
       vid->buffer = SDL_CreateRGBSurface(SDL_SWSURFACE, RARCH_SCALE_BASE * video->input_scale,
-            RARCH_SCALE_BASE * video->input_scale, 15,
+            RARCH_SCALE_BASE * video->input_scale, 16,
             fmt->Rmask, fmt->Gmask, fmt->Bmask, fmt->Amask);
    }
    RARCH_LOG("[Debug]: SDL Pixel format: Rshift = %u, Gshift = %u, Bshift = %u\n",
@@ -408,15 +408,15 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
 
    sdl_init_font(vid, g_settings.video.font_path, g_settings.video.font_size);
 
-   if (fmt->Rshift == 10 && fmt->Gshift ==  5 && fmt->Bshift == 0) // XRGB1555
+   if (fmt->Rshift == 11 && fmt->Gshift ==  5 && fmt->Bshift == 0) // RGB565
    {
-      RARCH_LOG("SDL: 15-bit format matches. Fast blit.\n");
-      vid->convert_15_func = convert_15bit_15bit_direct;
+      RARCH_LOG("SDL: 16-bit format matches. Fast blit.\n");
+      vid->convert_16_func = convert_16bit_16bit_direct;
    }
    else
    {
-      RARCH_LOG("SDL: 15-bit format does not match. Needs conversion.\n");
-      vid->convert_15_func = convert_15bit_15bit_shift;
+      RARCH_LOG("SDL: 16-bit format does not match. Needs conversion.\n");
+      vid->convert_16_func = convert_16bit_16bit_shift;
    }
 
    if (fmt->Rshift == 16 && fmt->Gshift == 8 && fmt->Bshift == 0) // ARGB8888
@@ -431,7 +431,7 @@ static void *sdl_gfx_init(const video_info_t *video, const input_driver_t **inpu
    }
 
    vid->scaler.scaler_type = video->smooth ? SCALER_TYPE_BILINEAR : SCALER_TYPE_POINT;
-   vid->scaler.in_fmt  = vid->render32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_0RGB1555;
+   vid->scaler.in_fmt  = vid->render32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_RGB565;
    vid->scaler.out_fmt = vid->scaler.in_fmt;
 
    return vid;
@@ -441,56 +441,56 @@ error:
    return NULL;
 }
 
-static inline uint16_t conv_pixel_32_15(uint32_t pix, const SDL_PixelFormat *fmt)
+static inline uint16_t conv_pixel_32_16(uint32_t pix, const SDL_PixelFormat *fmt) // Packs the top 5/6/5 bits of an XRGB8888 pixel's R/G/B at fmt's channel shifts.
 {
-   uint16_t r = ((pix & 0x00f80000) >> 19) << fmt->Rshift;
-   uint16_t g = ((pix & 0x0000f800) >> 11) << fmt->Gshift;
+   uint16_t r = ((pix & 0x00f80000) >> 19) << fmt->Rshift; // mask keeps bits 19..23, so >> 19 (not 18) right-aligns the 5 red bits
+   uint16_t g = ((pix & 0x0000fc00) >> 10) << fmt->Gshift; // mask keeps bits 10..15: the 6 green bits, right-aligned
    uint16_t b = ((pix & 0x000000f8) >>  3) << fmt->Bshift;
    return r | g | b;
 }
 
-static inline uint32_t conv_pixel_15_32(uint16_t pix, const SDL_PixelFormat *fmt)
+static inline uint32_t conv_pixel_16_32(uint16_t pix, const SDL_PixelFormat *fmt) // Expands one RGB565 pixel to 8 bits per channel and places each channel at fmt's R/G/B shift.
 {
-   uint32_t r = (pix >> 10) & 0x1f;
-   uint32_t g = (pix >>  5) & 0x1f;
+   uint32_t r = (pix >> 11) & 0x1f; // red: 5 bits taken from bits 11..15
+   uint32_t g = (pix >>  5) & 0x3f; // green: 6 bits taken from bits 5..10
    uint32_t b = (pix >>  0) & 0x1f;
 
    r = (r << 3) | (r >> 2);
-   g = (g << 3) | (g >> 2);
+   g = (g << 2) | (g >> 4); // 6 -> 8 bits: shift up by two and refill the low two bits from the top two
    b = (b << 3) | (b >> 2);
 
    return (r << fmt->Rshift) | (g << fmt->Gshift) | (b << fmt->Bshift);
 }
 
-static void convert_32bit_15bit(uint16_t *out, unsigned outpitch,
+static void convert_32bit_16bit(uint16_t *out, unsigned outpitch, // Converts a width x height frame of 32-bit pixels to 16-bit, one row at a time.
       const uint32_t *input, unsigned width, unsigned height,
       unsigned pitch, const SDL_PixelFormat *fmt)
 {
    for (unsigned y = 0; y < height; y++)
    {
       for (unsigned x = 0; x < width; x++)
-         out[x] = conv_pixel_32_15(input[x], fmt);
+         out[x] = conv_pixel_32_16(input[x], fmt); // per-pixel conversion; row stepping below divides outpitch by 2 and pitch by 4 (pitches presumably in bytes)
 
       out += outpitch >> 1;
       input += pitch >> 2;
    }
 }
 
-static void convert_15bit_32bit(uint32_t *out, unsigned outpitch,
+static void convert_16bit_32bit(uint32_t *out, unsigned outpitch, // Converts a width x height frame of 16-bit pixels to 32-bit, one row at a time.
       const uint16_t *input, unsigned width, unsigned height,
       unsigned pitch, const SDL_PixelFormat *fmt)
 {
    for (unsigned y = 0; y < height; y++)
    {
       for (unsigned x = 0; x < width; x++)
-         out[x] = conv_pixel_15_32(input[x], fmt);
+         out[x] = conv_pixel_16_32(input[x], fmt); // per-pixel conversion; row stepping below divides outpitch by 4 and pitch by 2 (pitches presumably in bytes)
 
       out += outpitch >> 2;
       input += pitch >> 1;
    }
 }
 
-static void convert_15bit_15bit_direct(uint16_t *out, unsigned outpitch,
+static void convert_16bit_16bit_direct(uint16_t *out, unsigned outpitch,
       const uint16_t *input, unsigned width, unsigned height,
       unsigned pitch, const SDL_PixelFormat *fmt)
 {
@@ -516,7 +516,7 @@ static void convert_32bit_32bit_direct(uint32_t *out, unsigned outpitch,
    (void)fmt;
 }
 
-static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch,
+static void convert_16bit_16bit_shift(uint16_t *out, unsigned outpitch,
       const uint16_t *input, unsigned width, unsigned height,
       unsigned pitch, const SDL_PixelFormat *fmt)
 {
@@ -528,8 +528,8 @@ static void convert_15bit_15bit_shift(uint16_t *out, unsigned outpitch,
       for (unsigned x = 0; x < width; x++)
       {
          uint16_t color = src[x];
-         uint16_t r = ((color >> 10) & 0x1f) << fmt->Rshift;
-         uint16_t g = ((color >>  5) & 0x1f) << fmt->Gshift;
+         uint16_t r = ((color >> 11) & 0x1f) << fmt->Rshift;
+         uint16_t g = ((color >>  5) & 0x3f) << fmt->Gshift;
          uint16_t b = ((color >>  0) & 0x1f) << fmt->Bshift;
          dest[x] = r | g | b;
       }
@@ -585,13 +585,13 @@ static bool sdl_gfx_frame(void *data, const void *frame, unsigned width, unsigne
 
    // 15-bit -> 32-bit.
    if (vid->upsample)
-      convert_15bit_32bit((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
+      convert_16bit_32bit((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
    // 15-bit -> 15-bit
    else if (!vid->rgb32)
-      vid->convert_15_func((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
+      vid->convert_16_func((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint16_t*)frame, width, height, pitch, vid->screen->format);
    // 32-bit -> 15-bit
    else if (vid->rgb32 && !vid->render32)
-      convert_32bit_15bit((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
+      convert_32bit_16bit((uint16_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
    // 32-bit -> 32-bit
    else
       vid->convert_32_func((uint32_t*)vid->buffer->pixels, vid->buffer->pitch, (const uint32_t*)frame, width, height, pitch, vid->screen->format);
@@ -627,7 +627,7 @@ static bool sdl_gfx_frame(void *data, const void *frame, unsigned width, unsigne
       if (vid->render32)
          sdl_render_msg_32(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
       else
-         sdl_render_msg_15(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
+         sdl_render_msg_16(vid, vid->screen, msg, vid->screen->w, vid->screen->h, vid->screen->format);
    }
 
    char buf[128];
diff --git a/gfx/shader_cg.c b/gfx/shader_cg.c
index 5018cbcaac..1c0272050f 100644
--- a/gfx/shader_cg.c
+++ b/gfx/shader_cg.c
@@ -521,8 +521,8 @@ static void load_texture_data(GLuint *obj, const struct texture_image *img, bool
    glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
 #endif
    glTexImage2D(GL_TEXTURE_2D,
-         0, RARCH_GL_INTERNAL_FORMAT, img->width, img->height,
-         0, RARCH_GL_TEXTURE_TYPE, RARCH_GL_FORMAT32, img->pixels);
+         0, RARCH_GL_INTERNAL_FORMAT32, img->width, img->height,
+         0, RARCH_GL_TEXTURE_TYPE32, RARCH_GL_FORMAT32, img->pixels);
 
    free(img->pixels);
 }
diff --git a/gfx/shader_glsl.c b/gfx/shader_glsl.c
index 4b2e2f87c5..619c182ceb 100644
--- a/gfx/shader_glsl.c
+++ b/gfx/shader_glsl.c
@@ -465,8 +465,8 @@ static bool get_texture_image(const char *shader_path, xmlNodePtr ptr)
 
    glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
    glTexImage2D(GL_TEXTURE_2D,
-         0, RARCH_GL_INTERNAL_FORMAT,
-         img.width, img.height, 0, RARCH_GL_TEXTURE_TYPE, RARCH_GL_FORMAT32, img.pixels);
+         0, RARCH_GL_INTERNAL_FORMAT32,
+         img.width, img.height, 0, RARCH_GL_TEXTURE_TYPE32, RARCH_GL_FORMAT32, img.pixels);
 
    pglActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, 0);
diff --git a/gfx/xvideo.c b/gfx/xvideo.c
index 724bc738e1..252c716671 100644
--- a/gfx/xvideo.c
+++ b/gfx/xvideo.c
@@ -108,16 +108,16 @@ static inline void calculate_yuv(uint8_t *y, uint8_t *u, uint8_t *v, unsigned r,
 
 static void init_yuv_tables(xv_t *xv)
 {
-   xv->ytable = (uint8_t*)malloc(0x8000);
-   xv->utable = (uint8_t*)malloc(0x8000);
-   xv->vtable = (uint8_t*)malloc(0x8000);
+   xv->ytable = (uint8_t*)malloc(0x10000);
+   xv->utable = (uint8_t*)malloc(0x10000);
+   xv->vtable = (uint8_t*)malloc(0x10000);
 
-   for (unsigned i = 0; i < 0x8000; i++)
+   for (unsigned i = 0; i < 0x10000; i++)
    {
-      // Extract RGB555 color data from i
-      unsigned r = (i >> 10) & 0x1F, g = (i >> 5) & 0x1F, b = (i) & 0x1F;
+      // Extract RGB565 color data from i
+      unsigned r = (i >> 11) & 0x1f, g = (i >> 5) & 0x3f, b = (i >> 0) & 0x1f;
       r = (r << 3) | (r >> 2);  // R5->R8
-      g = (g << 3) | (g >> 2);  // G5->G8
+      g = (g << 2) | (g >> 4);  // G6->G8
       b = (b << 3) | (b >> 2);  // B5->B8
 
       calculate_yuv(&xv->ytable[i], &xv->utable[i], &xv->vtable[i], r, g, b);
@@ -224,7 +224,7 @@ static void render32_yuy2(xv_t *xv, const void *input_, unsigned width, unsigned
       for (unsigned x = 0; x < width; x++)
       {
          uint32_t p = *input++;
-         p = ((p >> 9) & 0x7c00) | ((p >> 6) & 0x03e0) | ((p >> 3) & 0x1f); // ARGB -> RGB15
+         p = ((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x1f); // ARGB -> RGB16
 
          uint8_t y0 = xv->ytable[p];
          uint8_t u = xv->utable[p];
@@ -253,7 +253,7 @@ static void render32_uyvy(xv_t *xv, const void *input_, unsigned width, unsigned
       for (unsigned x = 0; x < width; x++)
       {
          uint32_t p = *input++;
-         p = ((p >> 9) & 0x7c00) | ((p >> 6) & 0x03e0) | ((p >> 3) & 0x1f); // ARGB -> RGB15
+         p = ((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x1f); // ARGB -> RGB16
 
          uint8_t y0 = xv->ytable[p];
          uint8_t u = xv->utable[p];
diff --git a/libretro-test/libretro-test.c b/libretro-test/libretro-test.c
index 9c035c8be3..f589aca059 100644
--- a/libretro-test/libretro-test.c
+++ b/libretro-test/libretro-test.c
@@ -146,8 +146,8 @@ static void update_input(void)
 
 static void render_checkered(void)
 {
-   uint16_t color_r = 31 << 10;
-   uint16_t color_g = 31 <<  5;
+   uint16_t color_r = 31 << 11;
+   uint16_t color_g = 63 <<  5;
 
    uint16_t *line = frame_buf;
    for (unsigned y = 0; y < 240; y++, line += 320)
@@ -193,6 +193,13 @@ bool retro_load_game(const struct retro_game_info *info)
 
    environ_cb(RETRO_ENVIRONMENT_SET_INPUT_DESCRIPTORS, desc);
 
+   enum retro_pixel_format fmt = RETRO_PIXEL_FORMAT_RGB565;
+   if (!environ_cb(RETRO_ENVIRONMENT_SET_PIXEL_FORMAT, &fmt))
+   {
+      fprintf(stderr, "RGB565 is not supported.\n");
+      return false;
+   }
+
    (void)info;
    return true;
 }
diff --git a/libretro.h b/libretro.h
index c2b193c4a7..57d12a4606 100755
--- a/libretro.h
+++ b/libretro.h
@@ -355,6 +355,7 @@ enum retro_key
                                            // const enum retro_pixel_format * --
                                            // Sets the internal pixel format used by the implementation.
                                            // The default pixel format is RETRO_PIXEL_FORMAT_0RGB1555.
+                                           // This pixel format, however, is deprecated (see enum retro_pixel_format).
                                            // If the call returns false, the frontend does not support this pixel format.
                                            // This function should be called inside retro_load_game() or retro_get_system_av_info().
                                            //
@@ -368,8 +369,18 @@ enum retro_key
 
 enum retro_pixel_format
 {
-   RETRO_PIXEL_FORMAT_0RGB1555 = 0, // 0RGB1555, native endian. 0 bit must be set to 0.
-   RETRO_PIXEL_FORMAT_XRGB8888      // XRGB8888, native endian. X bits are ignored.
+   // 0RGB1555, native endian. 0 bit must be set to 0.
+   // This pixel format is the default for compatibility reasons only.
+   // If a 15/16-bit pixel format is desired, consider using RGB565.
+   RETRO_PIXEL_FORMAT_0RGB1555 = 0,
+
+   // XRGB8888, native endian. X bits are ignored.
+   RETRO_PIXEL_FORMAT_XRGB8888 = 1, // now explicit; same value the enumerator previously had implicitly
+
+   // RGB565, native endian. This is the recommended pixel format if a 15/16-bit format is desired,
+   // as it is the pixel format that is typically available on a wide range of low-power devices.
+   // It is also natively supported in APIs like OpenGL ES.
+   RETRO_PIXEL_FORMAT_RGB565   = 2 // new enumerator, appended after the existing values
 };
 
 struct retro_message
@@ -465,6 +476,8 @@ typedef bool (*retro_environment_t)(unsigned cmd, void *data);
 // Render a frame. Pixel format is 15-bit 0RGB1555 native endian unless changed (see RETRO_ENVIRONMENT_SET_PIXEL_FORMAT).
 // Width and height specify dimensions of buffer.
 // Pitch specifices length in bytes between two lines in buffer.
+// For performance reasons, it is highly recommended to have a frame that is packed in memory, i.e. pitch == width * byte_per_pixel.
+// Certain graphic APIs, such as OpenGL ES, do not like textures that are not packed in memory.
 typedef void (*retro_video_refresh_t)(const void *data, unsigned width, unsigned height, size_t pitch);
 
 // Renders a single audio frame. Should only be used if implementation generates a single sample at a time.
diff --git a/record/ffemu.c b/record/ffemu.c
index 36f4b6968c..3e9477e3b7 100644
--- a/record/ffemu.c
+++ b/record/ffemu.c
@@ -175,8 +175,8 @@ static bool ffemu_init_video(struct ff_video_info *video, const struct ffemu_par
 
    switch (param->pix_fmt)
    {
-      case FFEMU_PIX_XRGB1555:
-         video->scaler.in_fmt = SCALER_FMT_0RGB1555;
+      case FFEMU_PIX_RGB565:
+         video->scaler.in_fmt = SCALER_FMT_RGB565;
          video->pix_size = 2;
          break;
 
diff --git a/record/ffemu.h b/record/ffemu.h
index f5fe0d9271..aff204c373 100644
--- a/record/ffemu.h
+++ b/record/ffemu.h
@@ -25,7 +25,7 @@ extern "C" {
 
 enum ffemu_pix_format
 {
-   FFEMU_PIX_XRGB1555 = 0,
+   FFEMU_PIX_RGB565 = 0, // 16-bit RGB565 input; takes over value 0 from the removed FFEMU_PIX_XRGB1555
    FFEMU_PIX_BGR24,
    FFEMU_PIX_ARGB8888
 };
diff --git a/retroarch.c b/retroarch.c
index 01bc2628b3..0a804e3754 100644
--- a/retroarch.c
+++ b/retroarch.c
@@ -24,6 +24,7 @@
 #include "file.h"
 #include "general.h"
 #include "dynamic.h"
+#include "benchmark.h"
 #include "audio/utils.h"
 #include "record/ffemu.h"
 #include "rewind.h"
@@ -252,6 +253,24 @@ static void video_frame(const void *data, unsigned width, unsigned height, size_
       return;
 #endif
 
+   if (g_extern.system.pix_fmt == RETRO_PIXEL_FORMAT_0RGB1555 && data)
+   {
+      RARCH_PERFORMANCE_INIT(video_frame_conv);
+      RARCH_PERFORMANCE_START(video_frame_conv);
+      driver.scaler.in_width = width;
+      driver.scaler.in_height = height;
+      driver.scaler.out_width = width;
+      driver.scaler.out_height = height;
+      driver.scaler.in_stride = pitch;
+      driver.scaler.out_stride = width * sizeof(uint16_t);
+
+      scaler_ctx_scale(&driver.scaler, driver.scaler_out, data);
+      data = driver.scaler_out;
+      pitch = driver.scaler.out_stride;
+      RARCH_PERFORMANCE_STOP(video_frame_conv);
+      RARCH_PERFORMANCE_LOG("video_frame_conv()", video_frame_conv);
+   }
+
    // Slightly messy code,
    // but we really need to do processing before blocking on VSync for best possible scheduling.
 #ifdef HAVE_FFMPEG
@@ -1228,7 +1247,7 @@ static void init_recording(void)
    params.filename   = g_extern.record_path;
    params.fps        = fps;
    params.samplerate = samplerate;
-   params.pix_fmt    = g_extern.system.rgb32 ? FFEMU_PIX_ARGB8888 : FFEMU_PIX_XRGB1555;
+   params.pix_fmt    = g_extern.system.pix_fmt == RETRO_PIXEL_FORMAT_XRGB8888 ? FFEMU_PIX_ARGB8888 : FFEMU_PIX_RGB565;
 
    if (g_settings.video.gpu_record && driver.video->read_viewport)
    {
diff --git a/screenshot.c b/screenshot.c
index beb56e4811..be5cca4010 100644
--- a/screenshot.c
+++ b/screenshot.c
@@ -142,22 +142,35 @@ static void dump_line_16(uint8_t *line, const uint16_t *src, unsigned width)
    {
       uint16_t pixel = *src++;
       uint8_t b = (pixel >>  0) & 0x1f;
-      uint8_t g = (pixel >>  5) & 0x1f;
-      uint8_t r = (pixel >> 10) & 0x1f;
+      uint8_t g = (pixel >>  5) & 0x3f;
+      uint8_t r = (pixel >> 11) & 0x1f;
       *line++   = (b << 3) | (b >> 2);
-      *line++   = (g << 3) | (g >> 2);
+      *line++   = (g << 2) | (g >> 4);
       *line++   = (r << 3) | (r >> 2);
    }
 }
 
+static void dump_line_32(uint8_t *line, const uint32_t *src, unsigned width) // Writes `width` 32-bit source pixels to `line` as 3 bytes each.
+{
+   for (unsigned i = 0; i < width; i++)
+   {
+      uint32_t pixel = *src++;
+      *line++ = (pixel >>  0) & 0xff; // bits 0..7 first
+      *line++ = (pixel >>  8) & 0xff; // then bits 8..15
+      *line++ = (pixel >> 16) & 0xff; // then bits 16..23; bits 24..31 are never written
+   }
+}
+
 static void dump_content(FILE *file, const void *frame,
       int width, int height, int pitch, bool bgr24)
 {
-   const uint8_t  *frame_bgr = (const uint8_t*)frame;
-   const uint16_t *frame16   = (const uint16_t*)frame;
-
-   if (!bgr24)
-      pitch /= sizeof(uint16_t);
+   union
+   {
+      const uint8_t *u8;
+      const uint16_t *u16;
+      const uint32_t *u32;
+   } u;
+   u.u8 = (const uint8_t*)frame;
 
    uint8_t **lines = (uint8_t**)calloc(height, sizeof(uint8_t*));
    if (!lines)
@@ -174,13 +187,18 @@ static void dump_content(FILE *file, const void *frame,
 
    if (bgr24) // BGR24 byte order. Can directly copy.
    {
-      for (int j = 0; j < height; j++, frame_bgr += pitch)
-         dump_line_bgr(lines[j], frame_bgr, width);
+      for (int j = 0; j < height; j++, u.u8 += pitch)
+         dump_line_bgr(lines[j], u.u8, width);
    }
-   else // ARGB1555
+   else if (g_extern.system.pix_fmt == RETRO_PIXEL_FORMAT_XRGB8888)
    {
-      for (int j = 0; j < height; j++, frame16 += pitch)
-         dump_line_16(lines[j], frame16, width);
+      for (int j = 0; j < height; j++, u.u8 += pitch)
+         dump_line_32(lines[j], u.u32, width);
+   }
+   else // RGB565
+   {
+      for (int j = 0; j < height; j++, u.u8 += pitch)
+         dump_line_16(lines[j], u.u16, width);
    }
 
 #ifdef HAVE_LIBPNG