diff --git a/SConstruct b/SConstruct
index 08b92b90..d5c0da2e 100644
--- a/SConstruct
+++ b/SConstruct
@@ -9,7 +9,9 @@ opts.AddOptions(
   BoolOption('LSB_FIRST', 'Least signficant byte first (non-PPC)', 1),
   BoolOption('DEBUG',     'Build with debugging symbols', 0),
   BoolOption('LUA',       'Enable Lua support', 1),
-  BoolOption('NEWPPU',    'Enable new PPU core', 0)
+  BoolOption('NEWPPU',    'Enable new PPU core', 0),
+  BoolOption('CREATE_AVI', 'Enable avi creation support (SDL only)', 0),
+  BoolOption('LOGO', 'Enable a logoscreen when creating avis (SDL only)', '1')
 )
 
 env = Environment(options = opts)
@@ -86,7 +88,14 @@ else:
       print "*** WARNING ***"
       print "Zenity could not be found in the PATH.  File dialogs will not work without zenity installed."
       raw_input('Press any key to continue. . .')
-    
+  
+  ### Search for gd if we're not in Windows
+  if env['PLATFORM'] != 'win32' and env['PLATFORM'] != 'cygwin' and env['CREATE_AVI'] and env['LOGO']:
+    gd = conf.CheckLib('gd', autoadd=1)
+    if gd == 0:
+      env['LOGO'] = 0
+      print 'Did not find libgd, you won\'t be able to create a logo screen for your avis.'
+   
   if conf.CheckFunc('asprintf'):
     conf.env.Append(CCFLAGS = " -DHAVE_ASPRINTF")
   if env['OPENGL'] and conf.CheckLibWithHeader('GL', 'GL/gl.h', 'c++', autoadd=1):
@@ -115,6 +124,11 @@ print "base CCFLAGS:",env['CCFLAGS']
 if env['DEBUG']:
   env.Append(CPPDEFINES=["_DEBUG"], CCFLAGS = ['-g'])
 
+if env['PLATFORM'] != 'win32' and env['PLATFORM'] != 'cygwin' and env['CREATE_AVI']:
+  env.Append(CPPDEFINES=["CREATE_AVI"])
+else:
+  env['CREATE_AVI']=0  # not supported here: src/SConscript then skips drivers/videolog
+
 Export('env')
 SConscript('src/SConscript')
 
diff --git a/changelog.txt b/changelog.txt
index ac189cfc..21e47f7d 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,4 +1,5 @@
 ---version 2.0.3 yet to be released---
+25-oct-2008 - shinydoofy - added support for AVI creation for SDL, see documentation/Videolog.txt for more
 19-oct-2008 - shinydoofy - toggle lag frame counter for SDL, default hotkey F8
 19-oct-2008 - shinydoofy - toggle skipping of lag frames for SDL, default hotkey F6
 19-oct-2008 - shinydoofy - [ 2179829 ] user ability to toggle "bind savestates to movie" added for SDL, default hotkey F2
diff --git a/documentation/Videolog.txt b/documentation/Videolog.txt
new file mode 100644
index 00000000..04278638
--- /dev/null
+++ b/documentation/Videolog.txt
@@ -0,0 +1,45 @@
+Since SVN revision 931, FCEUX features a new option to create avi files from a recorded movie and it is relatively easy to use if you know the bare basics of mencoder.
+Call "scons CREATE_AVI=1" to activate it. You will, however, most likely need mencoder to use it.
+
+You get the raw video data via stdin and the audio data from a fifo file. Let's say you want the video to be in the best quality available, no matter how long it takes or how big the avi file might get. In order to get the NES's original video resolution and a good sound quality, you might need to set some settings beforehand or just pass them along while calling mencoder.
+
+
+Here's an example:
+./fceux \
+  --xscale 1 --yscale 1 --special 0 \
+  --pal 0 \
+  --sound 1 --soundq 1 --soundrate 48000 \
+  --nospritelim 1 \
+  --videolog "mencoder - -o myfirstencodedrun.avi \
+    -ovc x264 -x264encopts qp=0 \
+    -oac pcm \
+    -noskip -nocache -mc 0 -aspect 4/3 \
+    NESVSETTINGS" \
+  --playmov mymovie.fm2 myROM.nes
+
+Now let's see what is done and why we did it:
+First of all, we started fceux with "./fceux" and gave it some options:
+ --xscale and --yscale determine how much bigger the video is in comparison to its regular size. There's no point in using anything other than 1 here because you can always watch your video in fullscreen or at least scale it, can't you? As a nice bonus, it takes less time to create the avi file and also saves valuable space on your hard disk.
+ --special would usually do something fancy to your picture when you're playing a ROM, but again, it's mostly pointless to use for an avi.
+ --pal 0 lets the game run at ~60Hz. Set this to 1 if you are using a PAL region ROM.
+ --sound 1 activates sound.
+ --soundq 1 activates high quality sound.
+ --soundrate 48000 sets the sound at 48kHz.
+ --nospritelim deactivates the NES's limit of 8 sprites per scanline.
+ --videolog calls mencoder:
+  - states we're getting the video stream from stdin.
+  -o determines the name of the produced avi file.
+  -ovc x264 sets the video codec to x264, which is highly recommended for quality reasons.
+  -x264encopts qp=0 tells the x264 codec to use a quantizer of 0, which results in lossless video quality.
+  -oac pcm saves the audio data uncompressed (watch out, this might turn out really big).
+  -noskip makes sure that no frame is dropped.
+  -nocache is responsible for immediate encoding and not using any cache.
+  -mc 0 makes sure that the sound does not go out of sync.
+  -aspect 4/3 sets the avi's aspect ratio so you can see it in fullscreen and have no borders to the left and right.
+  NESVSETTINGS takes care of proper recognition of the audio and video data from FCEUX.
+  &> mencoder.log (optional, not shown in the example above) redirects mencoder's output into a file called mencoder.log in your current working directory.
+ --playmov reads which movie file we want to load (here it's mymovie.fm2) and which ROM to use for it (myROM.nes).
+
+To go for faster encoding and thus less quality, change "-ovc x264 -x264encopts qp=0" to "-ovc xvid -xvidencopts bitrate=200" and "-oac pcm" to "-oac mp3lame -lameopts mode=3:preset=60" to create a 200 kbps xvid video with 60 kbps of mono mp3 audio.
+
+One last reminder: setting all these options for FCEUX of course changes the settings you've set before (like sound quality or whether or not to scale the video image). So be sure to back up your config file first (in ~/.fceux/) if you don't want to set it all up again after encoding.
diff --git a/src/SConscript b/src/SConscript
index e783008c..6c088c98 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -32,7 +32,8 @@ drivers/common
 fir
 input
 utils
-mappers""")
+mappers
+""")
 #palettes
 
 Import('env')
@@ -41,6 +42,11 @@ Export('env')
 if env['LUA']:
   file_list.append('lua-engine.cpp')
 
+if env['CREATE_AVI']:
+  subdirs.append('drivers/videolog')
+  
+
+
 for dir in subdirs:
   subdir_files = SConscript('%s/SConscript' % dir)
   file_list.append(subdir_files)
diff --git a/src/drivers/common/vidblit.cpp b/src/drivers/common/vidblit.cpp
index 08abc305..064aa491 100644
--- a/src/drivers/common/vidblit.cpp
+++ b/src/drivers/common/vidblit.cpp
@@ -284,7 +284,7 @@ void SetPaletteBlitToHigh(uint8 *src)
  }
 }
 
-static void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
+void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
 {
  int x,y;
 
@@ -306,7 +306,7 @@ static void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
 }
 
 
-static void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
+void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
         int shiftr[3], int shiftl[3])
 {
  int x,y;
diff --git a/src/drivers/common/vidblit.h b/src/drivers/common/vidblit.h
index 1fc05a5b..cf957e32 100644
--- a/src/drivers/common/vidblit.h
+++ b/src/drivers/common/vidblit.h
@@ -23,3 +23,7 @@ void SetPaletteBlitToHigh(uint8 *src);
 void KillBlitToHigh(void);
 void Blit8ToHigh(uint8 *src, uint8 *dest, int xr, int yr, int pitch, int xscale, int yscale);
 void Blit8To8(uint8 *src, uint8 *dest, int xr, int yr, int pitch, int xscale, int yscale, int efx, int special);
+
+void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch);
+void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
+        int shiftr[3], int shiftl[3]);
diff --git a/src/drivers/sdl/config.cpp b/src/drivers/sdl/config.cpp
index 6b0e952a..4b1e2689 100644
--- a/src/drivers/sdl/config.cpp
+++ b/src/drivers/sdl/config.cpp
@@ -185,6 +185,10 @@ InitConfig()
     // load lua script
     config->addOption("loadlua", "SDL.LuaScript", "");
     #endif
+    
+    #ifdef CREATE_AVI
+    config->addOption("videolog",  "SDL.VideoLog",  "");
+    #endif    
 	
 	// enable new PPU core
 	config->addOption("newppu", "SDL.NewPPU", "0");
diff --git a/src/drivers/sdl/input.cpp b/src/drivers/sdl/input.cpp
index 357f050e..8a6db7de 100644
--- a/src/drivers/sdl/input.cpp
+++ b/src/drivers/sdl/input.cpp
@@ -253,7 +253,7 @@ KeyboardCommands()
 
     // Toggle throttling
     NoWaiting &= ~1;
-    if(KEY(GRAVE)) {
+    if(KEY(TAB)) {
         NoWaiting |= 1;
     }
 
diff --git a/src/drivers/sdl/sdl-throttle.cpp b/src/drivers/sdl/sdl-throttle.cpp
index dcc061db..90913753 100644
--- a/src/drivers/sdl/sdl-throttle.cpp
+++ b/src/drivers/sdl/sdl-throttle.cpp
@@ -4,14 +4,25 @@
 #include "sdl.h"
 #include "throttle.h"
 
-static uint64 s_tfreq;
-static uint64 s_desiredfps;
+static const double Slowest = 0.015625; // 1/64x speed (around 1 fps on NTSC)
+static const double Fastest = 32;       // 32x speed   (around 1920 fps on NTSC)
+static const double Normal  = 1.0;      // 1x speed    (around 60 fps on NTSC)
 
-static int32 s_fpsScaleTable[]=
-{ 3, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
-int32 g_fpsScale = 256;
+static uint64 Lasttime, Nexttime;
+static long double desired_frametime;
+static int InFrame;
+double g_fpsScale = Normal; // used by sdl.cpp
+bool MaxSpeed = false;
 
-#define FPS_TABLE_SIZE (sizeof(s_fpsScaleTable) / sizeof(s_fpsScaleTable[0]))
+/* LOGMUL = exp(log(2) / 3)
+ *
+ * This gives us a value such that if we do x*=LOGMUL three times,
+ * then after that, x is twice the value it was before.
+ *
+ * This gives us three speed steps per doubling of the speed.
+ *
+ */
+#define LOGMUL 1.259921049894873
 
 /**
  * Refreshes the FPS throttling variables.
@@ -19,87 +30,89 @@ int32 g_fpsScale = 256;
 void
 RefreshThrottleFPS()
 {
-    s_desiredfps = FCEUI_GetDesiredFPS() >> 8;
-    s_desiredfps = (s_desiredfps * g_fpsScale) >> 8;
-    s_tfreq = 10000000;
-    s_tfreq <<= 16; /* Adjust for fps returned from FCEUI_GetDesiredFPS(). */
+    uint64 fps = FCEUI_GetDesiredFPS(); // Do >> 24 to get in Hz
+    desired_frametime = 16777216.0l / (fps * g_fpsScale);
+
+    Lasttime=0;   
+    Nexttime=0;
+    InFrame=0;
 }
 
 /**
  * Perform FPS speed throttling by delaying until the next time slot.
  */
-void
+int
 SpeedThrottle()
 {
-    bool doDelay;
-
-    // XXX soules - go back through and get rid of static function variables
-    static uint64 ttime,ltime=0;
-  
-    // loop until we've delayed enough
-    do {
-        doDelay = false;
-
-        // check the current time
-        ttime = SDL_GetTicks();
-        ttime *= 10000;
-
-        if((ttime - ltime) < (s_tfreq / s_desiredfps)) {
-            int64 delay = (s_tfreq / s_desiredfps) - (ttime - ltime);
-            if(delay > 0) {
-                SDL_Delay(delay / 10000);
-            }
-
-            doDelay = true;
-        }
-    } while(doDelay);
-
-    // update the "last time" to match when we want the next tick
-    if((ttime - ltime) >= ((s_tfreq * 4) / s_desiredfps)) {
-        ltime = ttime;
-    } else {
-        ltime += s_tfreq / s_desiredfps;
+    if(g_fpsScale >= 32)
+    {
+        return 0; /* Done waiting */
     }
+    uint64 time_left;
+    uint64 cur_time;
+    
+    if(!Lasttime)
+        Lasttime = SDL_GetTicks();
+    
+    if(!InFrame)
+    {
+        InFrame = 1;
+        Nexttime = Lasttime + desired_frametime * 1000;
+    }
+    
+    cur_time  = SDL_GetTicks();
+    if(cur_time >= Nexttime)
+        time_left = 0;
+    else
+        time_left = Nexttime - cur_time;
+    
+    if(time_left > 50)
+    {
+        time_left = 50;
+        /* In order to keep input responsive, don't wait too long at once */
+        /* 50 ms wait gives us a 20 Hz responsetime which is nice. */
+    }
+    else
+        InFrame = 0;
+    
+    /*fprintf(stderr, "attempting to sleep %Ld ms, frame complete=%s\n",
+        time_left, InFrame?"no":"yes");*/
+    SDL_Delay(time_left);
+    
+    if(!InFrame)
+    {
+        Lasttime = SDL_GetTicks();
+        return 0; /* Done waiting */
+    }
+    return 1; /* Must still wait some more */
 }
 
 /**
  * Set the emulation speed throttling to the next entry in the speed table.
  */
-void
-IncreaseEmulationSpeed()
+void IncreaseEmulationSpeed(void)
 {
-    int i = 0;
+    g_fpsScale *= LOGMUL;
+    
+    if(g_fpsScale > Fastest) g_fpsScale = Fastest;
 
-    // find the next entry in the FPS rate table
-    while(i < (FPS_TABLE_SIZE - 2) && s_fpsScaleTable[i] < g_fpsScale) {
-        i++;
-    }
-    g_fpsScale = s_fpsScaleTable[i+1];
-
-    // refresh the FPS throttling variables
     RefreshThrottleFPS();
-
-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+     
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }
 
 /**
  * Set the emulation speed throttling to the previous entry in the speed table.
  */
-void
-DecreaseEmulationSpeed()
+void DecreaseEmulationSpeed(void)
 {
-    int i = 1;
+    g_fpsScale /= LOGMUL;
+    if(g_fpsScale < Slowest)
+        g_fpsScale = Slowest;
 
-    // find the previous entry in the FPS rate table
-    while(i < FPS_TABLE_SIZE && s_fpsScaleTable[i] < g_fpsScale) {
-        i++;
-    } 
-    g_fpsScale = s_fpsScaleTable[i - 1];
-
-    // refresh the FPS throttling variables
     RefreshThrottleFPS();
 
-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }
 
 /**
@@ -108,21 +121,24 @@ DecreaseEmulationSpeed()
 void
 FCEUD_SetEmulationSpeed(int cmd)
 {
+    MaxSpeed = false;
+    
     switch(cmd) {
     case EMUSPEED_SLOWEST:
-        g_fpsScale = s_fpsScaleTable[0];
+        g_fpsScale = Slowest;
         break;
     case EMUSPEED_SLOWER:
         DecreaseEmulationSpeed();
         break;
     case EMUSPEED_NORMAL:
-        g_fpsScale = 256;
+        g_fpsScale = Normal;
         break;
     case EMUSPEED_FASTER:
         IncreaseEmulationSpeed();
         break;
     case EMUSPEED_FASTEST:
-        g_fpsScale = s_fpsScaleTable[FPS_TABLE_SIZE - 1];
+        g_fpsScale = Fastest;
+        MaxSpeed = true;
         break;
     default:
         return;
@@ -130,5 +146,5 @@ FCEUD_SetEmulationSpeed(int cmd)
 
     RefreshThrottleFPS();
 
-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }
diff --git a/src/drivers/sdl/sdl-video.cpp b/src/drivers/sdl/sdl-video.cpp
index 3e4d1db6..5c50ed32 100644
--- a/src/drivers/sdl/sdl-video.cpp
+++ b/src/drivers/sdl/sdl-video.cpp
@@ -33,9 +33,13 @@
 #include "sdl-icon.h"
 #include "dface.h"
 
-#include "../common/configSys.h"
+#include "../common/configSys.h"
 #include "sdl-video.h"
 
+#ifdef CREATE_AVI
+#include "../videolog/nesvideos-piece.h"
+#endif
+
 // GLOBALS
 extern Config *g_config;
 
@@ -64,16 +68,18 @@ static int noframe;
 
 static int s_paletterefresh;
 
+extern bool MaxSpeed;
+
 /**
  * Attempts to destroy the graphical video display.  Returns 0 on
  * success, -1 on failure.
- */
-
+ */
+
 //draw input aids if we are fullscreen
 bool FCEUD_ShouldDrawInputAids()
 {
 	return s_fullscreen!=0;
-}
+}
  
 int
 KillVideo()
@@ -596,6 +602,80 @@ BlitScreen(uint8 *XBuf)
     SDL_UpdateRect(s_screen, xo, yo,
                    (Uint32)(NWIDTH * s_exs), (Uint32)(s_tlines * s_eys));
 
+#ifdef CREATE_AVI
+#if 0 /* PAL INTO NTSC HACK */
+ { int fps = FCEUI_GetDesiredFPS();
+ if(FCEUI_GetDesiredFPS() == 838977920) fps = 1008307711;
+ NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp);
+ if(FCEUI_GetDesiredFPS() == 838977920)
+ {
+   static unsigned dup=0;
+   if(++dup==5) { dup=0;
+   NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp); }
+ } }
+#else
+ { int fps = FCEUI_GetDesiredFPS();
+   static unsigned char* result = NULL;
+   static unsigned resultsize = 0;
+   int width = NWIDTH, height = s_tlines;
+   if(!result || resultsize != width*height*3*2)
+   {
+       if(result) free(result);
+       result = (unsigned char*) malloc(resultsize = width*height*3*2);
+   }
+   switch(s_curbpp)
+   {
+   #if 0
+     case 24: case 32: case 15: case 16:
+       /* Convert to I420 if possible, because our I420 conversion is optimized
+        * and it'll produce less network traffic, hence faster throughput than
+        * anything else. And H.264 eats only I420, so it'd be converted sooner
+        * or later anyway if we didn't do it. Win-win situation.
+        */
+       switch(s_curbpp)
+       {
+         case 32: Convert32To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 24: Convert24To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 15: Convert15To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 16: Convert16To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+       }
+       NESVideoLoggingVideo(&result[0], width,height, fps, 12);
+       break;
+   #endif
+     default:
+       NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp);
+   }
+ }
+#endif
+
+#if REALTIME_LOGGING /* pad the log with repeated frames and silence for wall-clock time that passed between calls */
+ {
+   static struct timeval last_time;
+   static int first_time=1;
+   extern long soundrate;
+   
+   struct timeval cur_time;
+   gettimeofday(&cur_time, NULL);
+   
+   double timediff = /* seconds elapsed since the previous call */
+       (cur_time.tv_sec *1e6 + cur_time.tv_usec
+     - (last_time.tv_sec *1e6 + last_time.tv_usec)) / 1e6;
+   
+   int nframes = timediff * 60 - 1; /* number of 1/60 s periods beyond the frame already logged above */
+   if(first_time)
+     first_time = 0; /* last_time is not valid yet on the first call, so nothing is padded */
+   else while(nframes > 0)
+   {
+     static const unsigned char Buf[800*4] = {0}; /* all-zero audio, i.e. silence */
+     NESVideoLoggingVideo(s_screen->pixels, 256,s_tlines, FCEUI_GetDesiredFPS(), s_curbpp);
+     NESVideoLoggingAudio(Buf, soundrate,16,1, soundrate/60.0);
+     --nframes;
+   }
+   memcpy(&last_time, &cur_time, sizeof(last_time));
+ }
+#endif
+#endif
+
     // have to flip the displayed buffer in the case of double buffering
     if(s_screen->flags & SDL_DOUBLEBUF) {
         SDL_Flip(s_screen);
diff --git a/src/drivers/sdl/sdl.cpp b/src/drivers/sdl/sdl.cpp
index 6b1da0a7..c8a3689a 100644
--- a/src/drivers/sdl/sdl.cpp
+++ b/src/drivers/sdl/sdl.cpp
@@ -32,12 +32,18 @@
 
 #include "../common/configSys.h"
 
+#ifdef CREATE_AVI
+#include "../videolog/nesvideos-piece.h"
+#endif
+
 
 #ifdef WIN32
 #include <windows.h>
 #endif
 
-extern int32 g_fpsScale;
+extern double g_fpsScale;
+
+extern bool MaxSpeed;
 
 int CloseGame(void);
 
@@ -111,10 +117,12 @@ static void ShowUsage(char *prog)
 	puts("Options:");
 	puts(DriverUsage);
 	#ifdef _S9XLUA_H
-	puts ("--loadlua       f      Loads lua script from filename f.\n");
-	#else
-	puts("");
+	puts ("--loadlua       f      Loads lua script from filename f.");
 	#endif
+	#ifdef CREATE_AVI
+	puts ("--videolog      c      Call mencoder to grab the video and audio streams to\n                       encode them. Check the documentation for more on this.");
+	#endif
+	puts("");
 }
 
 /**
@@ -280,16 +288,48 @@ FCEUD_Update(uint8 *XBuf,
 {
     extern int FCEUDnetplay;
 
+    #ifdef CREATE_AVI
+    if(LoggingEnabled == 2 || (eoptions&EO_NOTHROTTLE))
+    {
+      if(LoggingEnabled == 2)
+      {
+        int16* MonoBuf = (int16*)malloc(sizeof(*MonoBuf) * Count);
+        int n;
+        for(n=0; n<Count; ++n)
+            MonoBuf[n] = Buffer[n] & 0xFFFF;
+        NESVideoLoggingAudio
+         (
+          MonoBuf, 
+          FSettings.SndRate, 16, 1,
+          Count
+         );
+        free(MonoBuf);
+      }
+      Count /= 2;
+      if(inited & 1)
+      {
+        if(Count > GetWriteSound()) Count = GetWriteSound();
+        if(Count > 0 && Buffer) WriteSound(Buffer,Count);   
+      }
+      if(inited & 2)
+        FCEUD_UpdateInput();
+      if(XBuf && (inited & 4)) BlitScreen(XBuf);
+      
+      //SpeedThrottle();
+        return;
+     }
+    #endif
+    
     int ocount = Count;
     // apply frame scaling to Count
-    Count = (Count<<8) / g_fpsScale;
+    Count = (int)(Count / g_fpsScale);
     if(Count) {
         int32 can=GetWriteSound();
         static int uflow=0;
         int32 tmpcan;
 
         // don't underflow when scaling fps
-        if(can >= GetMaxSound() && g_fpsScale<=256) uflow=1;	/* Go into massive underflow mode. */
+        if(can >= GetMaxSound() && g_fpsScale==1.0) uflow=1;	/* Go into massive underflow mode. */
 
         if(can > Count) can=Count;
         else uflow=0;
@@ -299,7 +339,7 @@ FCEUD_Update(uint8 *XBuf,
         //if(uflow) puts("Underflow");
         tmpcan = GetWriteSound();
         // don't underflow when scaling fps
-        if(g_fpsScale>256 || ((tmpcan < Count*0.90) && !uflow)) {
+        if(g_fpsScale>1.0 || ((tmpcan < Count*0.90) && !uflow)) {
             if(XBuf && (inited&4) && !(NoWaiting & 2))
                 BlitScreen(XBuf);
             Buffer+=can;
@@ -328,7 +368,10 @@ FCEUD_Update(uint8 *XBuf,
 
     } else {
         if(!NoWaiting && (!(eoptions&EO_NOTHROTTLE) || FCEUI_EmulationPaused()))
-            SpeedThrottle();
+        while (SpeedThrottle())
+        {
+            FCEUD_UpdateInput();
+        }
         if(XBuf && (inited&4)) {
             BlitScreen(XBuf);
         }
@@ -468,6 +511,17 @@ SDL_GL_LoadLibrary(0);
     // update the emu core
     UpdateEMUCore(g_config);
     g_config->getOption("SDL.Frameskip", &frameskip);
+    
+    #ifdef CREATE_AVI
+    {std::string tmp;
+    g_config->getOption("SDL.VideoLog", &tmp);
+    g_config->setOption("SDL.VideoLog", "");
+    if(!tmp.empty())
+    {
+        NESVideoSetVideoCmd(tmp.c_str());
+        LoggingEnabled = 1;
+    }}
+    #endif
 
     // load the specified game
     error = LoadGame(argv[romIndex]);
diff --git a/src/drivers/sdl/throttle.h b/src/drivers/sdl/throttle.h
index d6ca2aa8..d28517ef 100644
--- a/src/drivers/sdl/throttle.h
+++ b/src/drivers/sdl/throttle.h
@@ -1,2 +1,2 @@
 void RefreshThrottleFPS(void);
-void SpeedThrottle(void);
+int SpeedThrottle(void);
diff --git a/src/drivers/videolog/SConscript b/src/drivers/videolog/SConscript
new file mode 100644
index 00000000..c753ad4d
--- /dev/null
+++ b/src/drivers/videolog/SConscript
@@ -0,0 +1,15 @@
+my_list = Split("""
+nesvideos-piece.cpp
+rgbtorgb.cpp
+""")
+
+Import('env')
+
+if env['LOGO']:
+  env.Append(LIBS = ["gd"])
+  env.Append(CCFLAGS = "-DHAVE_GD")
+
+for x in range(len(my_list)):
+  my_list[x] = 'drivers/videolog/' + my_list[x]
+Return('my_list')
+
diff --git a/src/drivers/videolog/nesvideos-piece.cpp b/src/drivers/videolog/nesvideos-piece.cpp
new file mode 100644
index 00000000..5bb82b4a
--- /dev/null
+++ b/src/drivers/videolog/nesvideos-piece.cpp
@@ -0,0 +1,1247 @@
+#define THREAD_SAFETY
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <deque>
+#include <list>
+#include <map>
+
+#include <unistd.h>   // mknod, unlink, write
+#include <stdio.h>
+#include <sys/stat.h> // S_IFIFO
+#include <fcntl.h>    // fcntl
+#include <sys/poll.h> // poll
+#include <stdlib.h>   // setenv
+#include <string.h>   // strrchr
+#include <sys/file.h> // flock
+#include <errno.h>
+#include <glob.h>
+
+#ifdef HAVE_GD
+#include <gd.h>
+#endif
+
+#ifdef HAVE_X264 // don't worry, you really don't need it
+extern "C" {
+#include <x264.h>
+}
+#endif
+
+/* Note: This module assumes everyone uses BGR16 as display depth */
+
+//#define LOGO_LENGTH_HEADER  (1.2)
+//#define LOGO_LENGTH_OVERLAP (10.0-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_HEADER  (1.1)
+//#define LOGO_LENGTH_OVERLAP (3.95-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_OVERLAP (3-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_HEADER  (1.5)
+#define LOGO_LENGTH_OVERLAP (0)
+#define LOGO_LENGTH_HEADER (0)
+
+static std::string VIDEO_CMD = "";
+/*
+-rawvideo on:fps=60:format=0x42475220:w=256:h=224:size=$[1024*224]
+-audiofile "+AUDIO_FN+"
+*/
+static std::string AUDIO_FN = "s.log";
+
+static bool Terminate=false;
+static unsigned videonumber = 0;
+
+#ifdef THREAD_SAFETY
+# include <pthread.h>
+static pthread_mutex_t APIlock = PTHREAD_MUTEX_INITIALIZER;
+struct ScopedLock // RAII guard: locks APIlock on construction and unlocks it on destruction
+{ ScopedLock() { 
+                 pthread_mutex_lock(&APIlock);
+                 //fprintf(stderr, "audio start\n"); fflush(stderr);
+               }
+  ~ScopedLock() {
+                 //fprintf(stderr, "audio end\n"); fflush(stderr);
+                 pthread_mutex_unlock(&APIlock); }
+};
+#endif
+
+static unsigned NonblockWrite(FILE* fp, const unsigned char*buf, unsigned length) // returns the number of bytes written, 0 if none
+{
+  Retry:
+    int result = write(fileno(fp), buf, length); // writes straight to the descriptor, not through the FILE buffer
+    if(result == -1 && errno==EAGAIN) // would block: report nothing written so the caller can buffer the data
+    {
+        return 0;
+    }
+    if(result == -1 && errno==EINTR) goto Retry; // interrupted by a signal: try the same write again
+    if(result == -1)
+    {
+        perror("write");
+        Terminate=true; // any other error: Audio()/Video() return early once this flag is set
+        return 0;
+    }
+    return result;
+}
+static int WaitUntilOneIsWritable(FILE*f1, FILE*f2) // returns a bitmask: 1 = f1 writable, 2 = f2 writable, 0 = neither
+{
+    struct pollfd po[2] = { {fileno(f1),POLLOUT,0}, {fileno(f2),POLLOUT,0} };
+    poll(po, 2, -1); // timeout -1 blocks until an event is reported; the return value is not checked
+    return ((po[0].revents & POLLOUT) ? 1 : 0)
+         | ((po[1].revents & POLLOUT) ? 2 : 0);
+}
+
+#define BGR32 0x42475220  // BGR32 fourcc
+#define BGR24 0x42475218  // BGR24 fourcc
+#define BGR16 0x42475210  // BGR16 fourcc
+#define BGR15 0x4247520F  // BGR15 fourcc
+#define I420  0x30323449  // I420 fourcc
+#define YUY2  0x32595559  // YUY2 fourcc
+
+static unsigned USE_FOURCC = BGR16;
+static unsigned INPUT_BPP  = 16;
+
+#define u32(n) (n)&255,((n)>>8)&255,((n)>>16)&255,((n)>>24)&255
+#define u16(n) (n)&255,((n)>>8)&255
+#define s4(s) s[0],s[1],s[2],s[3]
+
+static const unsigned FPS_SCALE = 0x1000000;
+
+static struct Construct
+{
+    Construct() // runs during static initialisation: rewrites AUDIO_FN as "<cwd>/<AUDIO_FN>"
+    {
+        char Buf[4096];
+        if(!getcwd(Buf,sizeof(Buf))) return; // cwd unavailable: Buf is unset, so keep the relative name
+        Buf[sizeof(Buf)-1]=0;
+        AUDIO_FN = Buf + std::string("/") + AUDIO_FN;
+    }
+} Construct;
+
+class AVI // abstract interface for receiving logged audio and video; NormalAVI implements it
+{
+public:
+    AVI()          { }
+    virtual ~AVI() { }
+
+    virtual void Audio // NormalAVI stores r, b, c as rate, bits per sample and channel count; d holds nsamples samples
+        (unsigned r,unsigned b,unsigned c,
+         const unsigned char*d, unsigned nsamples) = 0;
+
+    virtual void Video // NormalAVI stores w, h, f as width, height and scaled fps; d is the raw frame data
+        (unsigned w,unsigned h,unsigned f, const unsigned char*d) = 0;
+    
+    virtual void SaveState(const std::string&) { } // does nothing unless overridden
+    virtual void LoadState(const std::string&) { } // does nothing unless overridden
+};
+
+class NormalAVI: public AVI
+{
+    FILE* vidfp;
+    FILE* audfp;
+    
+    bool KnowVideo;
+    unsigned vid_width;
+    unsigned vid_height;
+    unsigned vid_fps_scaled;
+    std::list<std::vector<unsigned char> > VideoBuffer;
+    unsigned VidBufSize;
+    
+    bool KnowAudio;
+    unsigned aud_rate;
+    unsigned aud_chans;
+    unsigned aud_bits;
+    std::list<std::vector<unsigned char> > AudioBuffer;
+    unsigned AudBufSize;
+    
+public:
+    NormalAVI() :
+        vidfp(NULL),
+        audfp(NULL),
+        KnowVideo(false), VidBufSize(0),
+        KnowAudio(false), AudBufSize(0)
+    {
+    }
+    virtual ~NormalAVI()
+    {
+        while(VidBufSize && AudBufSize)
+        {
+            CheckFlushing();
+        }
+        if(audfp) fclose(audfp);
+        if(vidfp) pclose(vidfp);
+        unlink(AUDIO_FN.c_str());
+    }
+    
+    virtual void Audio
+        (unsigned r,unsigned b,unsigned c,
+         const unsigned char*d, unsigned nsamples)
+    {
+        if(Terminate) return;
+        if(!KnowAudio)
+        {
+            aud_rate = r;
+            aud_chans = c;
+            aud_bits = b;
+            KnowAudio = true;
+        }
+        CheckFlushing();
+        
+        unsigned bytes = nsamples * aud_chans * (aud_bits / 8);
+        
+        unsigned wrote = 0;
+        if(KnowVideo && AudioBuffer.empty())
+        {
+            //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "aud", (void*)d, (void*)audfp);
+            wrote = NonblockWrite(audfp, d, bytes);
+            //fprintf(stderr, "Wrote %u\n", wrote);
+        }
+        if(wrote < bytes)
+        {
+            unsigned remain = bytes-wrote;
+            //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "aud", d+wrote, d+bytes);
+            AudioBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
+            AudBufSize += remain;
+        }
+        CheckFlushing();
+    }
+
+    virtual void Video
+        (unsigned w,unsigned h,unsigned f, const unsigned char*d)
+    {
+        if(Terminate) return;
+        if(!KnowVideo)
+        {
+            vid_width      = w;
+            vid_height     = h;
+            vid_fps_scaled = f;
+            KnowVideo = true;
+        }
+        CheckFlushing();
+        
+        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
+        unsigned bytes = vid_width * vid_height * bpp / 8;
+        
+        //std::vector<unsigned char> tmp(bytes, 'k');
+        //d = &tmp[0];
+        
+        unsigned wrote = 0;
+        if(KnowAudio && VideoBuffer.empty())
+        {
+            CheckBegin();
+            //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "vid", (void*)d, (void*)vidfp);
+            wrote = NonblockWrite(vidfp, d, bytes);
+            //fprintf(stderr, "Wrote %u\n", wrote);
+        }
+        
+        if(wrote < bytes)
+        {
+            unsigned remain = bytes-wrote;
+            //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "vid", d+wrote, d+bytes);
+
+            VideoBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
+            VidBufSize += remain;
+        }
+        CheckFlushing();
+    }
+
+private:
+    /* fp is passed as a reference because it may be NULL
+     * prior to calling, and this function changes it. */
+    template<typename BufType>
+    void FlushBufferSome(BufType& List, unsigned& Size, FILE*& fp, const char* what)
+    {
+        what=what;
+        
+    Retry:
+        if(List.empty() || Terminate) return;
+        
+        typename BufType::iterator i = List.begin();
+        std::vector<unsigned char>& buf = *i;
+        
+        if(buf.empty())
+        {
+            List.erase(i);
+            goto Retry;
+        }
+        
+        unsigned bytes = buf.size();
+        
+        CheckBegin();
+        //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, what, (void*)&buf[0], (void*)fp);
+        
+        unsigned ate = NonblockWrite(fp, &buf[0], bytes);
+
+        //fprintf(stderr, "Wrote %u\n", ate);
+        
+        buf.erase(buf.begin(), buf.begin()+ate);
+        
+        Size -= ate;
+        
+        if(buf.empty())
+        {
+            List.erase(i);
+        }
+    }
+
+    /* Pushes buffered audio/video data towards the encoder.
+     * Nothing happens until both KnowAudio and KnowVideo are set,
+     * or once Terminate has been raised.
+     */
+    void CheckFlushing()
+    {
+        if(!KnowAudio || !KnowVideo || Terminate) return;
+
+        if(AudioBuffer.empty() || VideoBuffer.empty())
+        {
+            /* At least one queue is empty: give each stream a single attempt. */
+            FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid");
+            FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud");
+            return;
+        }
+
+        /* Both queues hold data: keep feeding whichever stream is
+         * writable until one of the queues runs dry.
+         */
+        for(;;)
+        {
+            /* Bit 0 (&1) stands for vidfp, bit 1 (&2) for audfp. */
+            const int writable = WaitUntilOneIsWritable(vidfp, audfp);
+
+            /* Zero or negative: some kind of error, stop trying. */
+            if(writable <= 0) break;
+
+            if(writable & 1) FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid");
+            if(writable & 2) FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud");
+
+            if(AudioBuffer.empty() || VideoBuffer.empty()) break;
+        }
+    }
+    /* Formats the current video settings as the argument of
+     * mencoder's -rawvideo option.
+     */
+    std::string GetMEncoderRawvideoParam() const
+    {
+        /* Depths 15 and 17 are both counted as 16 bits per pixel
+         * when computing the frame size.
+         */
+        unsigned depth = INPUT_BPP;
+        if(depth == 15 || depth == 17) depth = 16;
+
+        const double fps = vid_fps_scaled / (double)FPS_SCALE;
+
+        char text[512];
+        sprintf(text, "fps=%g:format=0x%04X:w=%u:h=%u:size=%u",
+            fps,
+            USE_FOURCC,
+            vid_width,
+            vid_height,
+            vid_width*vid_height * depth/8);
+        return std::string(text);
+    }
+    /* Formats the current audio settings as the argument of
+     * mencoder's -rawaudio option. samplesize is in bytes and
+     * bitrate is the number of bytes per second.
+     */
+    std::string GetMEncoderRawaudioParam() const
+    {
+        char text[512];
+        sprintf(text,
+            "channels=%u:rate=%u:samplesize=%u:bitrate=%u",
+            aud_chans, aud_rate, aud_bits/8,
+            aud_rate*aud_chans*(aud_bits/8) );
+        return std::string(text);
+    }
+    /* Builds the shell command used to launch the encoder.
+     *
+     * Starts from VIDEO_CMD, substitutes the first NESVSETTINGS
+     * placeholder with the raw audio/video options and every
+     * VIDEONUMBER placeholder with the current video number.
+     * The final command is also echoed to stderr.
+     */
+    std::string GetMEncoderCommand() const
+    {
+        std::string cmd = VIDEO_CMD;
+
+        /* Options describing the raw streams we are going to feed. */
+        std::string mandatory = "-audiofile ";
+        mandatory += AUDIO_FN;
+        mandatory += " -audio-demuxer rawaudio";
+        mandatory += " -demuxer rawvideo";
+        mandatory += " -rawvideo ";
+        mandatory += GetMEncoderRawvideoParam();
+        mandatory += " -rawaudio ";
+        mandatory += GetMEncoderRawaudioParam();
+
+        /* The placeholder literals are written in two halves, as in
+         * the rest of this file.
+         */
+        std::string::size_type pos = cmd.find("NESV""SETTINGS");
+        if(pos == std::string::npos)
+            fprintf(stderr, "Warning: NESVSETTINGS not found in videocmd\n");
+        else
+            cmd.replace(pos, 4+8, mandatory);
+
+        char videonumstr[64];
+        sprintf(videonumstr, "%u", videonumber);
+
+        while((pos = cmd.find("VIDEO""NUMBER")) != std::string::npos)
+            cmd.replace(pos, 5+6, videonumstr);
+
+        fprintf(stderr, "Launch: %s\n", cmd.c_str()); fflush(stderr);
+
+        return cmd;
+    }
+
+    /* Makes sure the encoder process and the audio FIFO are open.
+     *
+     * - If the audio stream is not open yet, (re)creates the FIFO
+     *   named by AUDIO_FN.
+     * - If the video stream is not open yet, launches the encoder
+     *   command with popen() and switches the pipe to non-blocking.
+     * - Finally opens the FIFO for writing, retrying on ESTALE, and
+     *   switches it to non-blocking as well.
+     *
+     * Failures are reported with perror(); the corresponding FILE*
+     * is then left null.
+     */
+    void CheckBegin()
+    {
+        if(!audfp)
+        {
+            /* Start from a fresh FIFO; a stale one may be left over. */
+            unlink(AUDIO_FN.c_str());
+            if(mknod(AUDIO_FN.c_str(), S_IFIFO|0666, 0) != 0)
+                perror(AUDIO_FN.c_str());
+        }
+        
+        if(!vidfp)
+        {
+            /* Note: popen does not accept b/t in mode param */
+            setenv("LD_PRELOAD", "", 1);
+            vidfp = popen(GetMEncoderCommand().c_str(), "w");
+            if(!vidfp)
+            {
+                perror("Launch failed");
+            }
+            else
+            {
+                fcntl(fileno(vidfp), F_SETFL, O_WRONLY | O_NONBLOCK);
+            }
+        }
+        
+        while(!audfp)
+        {
+            audfp = fopen(AUDIO_FN.c_str(), "wb");
+            if(audfp)
+            {
+                fcntl(fileno(audfp), F_SETFL, O_WRONLY | O_NONBLOCK);
+                break;
+            }
+            
+            /* Capture errno first: perror() is allowed to modify it,
+             * which would make the ESTALE test below unreliable.
+             */
+            const int open_errno = errno;
+            perror(AUDIO_FN.c_str());
+            if(open_errno != ESTALE) break;
+        }
+    }
+};
+
+/* AVI writer used in rerecording mode.
+ *
+ * Instead of piping data to an encoder, it appends raw data to four
+ * files whose names are derived from VIDEO_CMD:
+ *   .vid    video data (raw frames, or x264 NAL units for 12 bpp
+ *           input when HAVE_X264 is defined)
+ *   .aud    raw audio data
+ *   .log    text event log: frame sizes and Begin/Mark/Save/Load/End
+ *           records carrying the current .vid/.aud offsets in hex
+ *   .state  table of save slots -> (.vid offset, .aud offset)
+ *
+ * Every file is protected with flock() while it is being written.
+ * A file that could not be opened is reported once and then skipped;
+ * offsets of a missing file are logged as 0.
+ */
+class RerecordingAVI: public AVI
+{
+    /* Save slot name -> (video offset, audio offset) at save time. */
+    std::map<std::string, std::pair<off_t, off_t> > FrameStates;
+    /* Last frame sizes announced in the event log. */
+    size_t aud_framesize;
+    size_t vid_framesize;
+    
+    FILE* vidfp;
+    FILE* audfp;
+    FILE* eventfp;
+    FILE* statefp;
+    
+#ifdef HAVE_X264
+    x264_t*        x264;
+    x264_param_t   param;
+    bool           forcekey;
+#endif
+    
+    /* Scoped exclusive flock() on a stdio stream. A null stream is
+     * accepted and simply not locked.
+     */
+    class LockF
+    {
+    public:
+        LockF(FILE* f) : fp(f) { if(fp) flock(fileno(fp), LOCK_EX); }
+        ~LockF()               { if(fp) flock(fileno(fp), LOCK_UN); }
+    private:
+        LockF(const LockF&);
+        LockF& operator=(const LockF&);
+        FILE* fp;
+    };
+    
+    /* Current offset of fp, or 0 when the stream is not open. */
+    static off_t TellOrZero(FILE* fp)
+    {
+        return fp ? ftello(fp) : 0;
+    }
+    
+public:
+    /* FrameNumber is accepted for interface reasons but not used here. */
+    RerecordingAVI(long FrameNumber)
+        : aud_framesize(0),
+          vid_framesize(0),
+          vidfp(0),
+          audfp(0),
+          eventfp(0),
+          statefp(0)
+#ifdef HAVE_X264
+          ,x264(0),
+          forcekey(true)
+#endif
+    {
+        (void)FrameNumber;
+        SetFn();
+    }
+    /* Logs an End record and closes every open file. */
+    virtual ~RerecordingAVI()
+    {
+        if(eventfp)
+        {
+            off_t vidpos = TellOrZero(vidfp);
+            off_t audpos = TellOrZero(audfp);
+            fprintf(eventfp,
+                "%llX %llX End\n",
+                (long long)vidpos, (long long)audpos);
+        }
+        if(vidfp) fclose(vidfp);
+        if(audfp) fclose(audfp);
+        if(eventfp) fclose(eventfp);
+        if(statefp) fclose(statefp);
+#ifdef HAVE_X264
+        if(x264) x264_encoder_close(x264);
+#endif
+    }
+
+    /* Appends nsamples samples of audio to the .aud file. A change of
+     * the bytes-per-second figure is announced in the event log first.
+     */
+    virtual void Audio
+        (unsigned aud_rate,unsigned aud_bits,unsigned aud_chans,
+         const unsigned char*data, unsigned nsamples)
+    {
+        size_t bytes = nsamples     * aud_chans * (aud_bits / 8);
+        size_t framesize = aud_rate * aud_chans * (aud_bits / 8);
+        
+        if(framesize != aud_framesize)
+        {
+            aud_framesize = framesize;
+            if(eventfp)
+            {
+                LockF el(eventfp);
+                fprintf(eventfp, "AudFrameSize %lu\n", (unsigned long)aud_framesize);
+                fflush(eventfp);
+            }
+        }
+        
+        if(!audfp) return; /* Nowhere to write to */
+        
+        LockF al(audfp);
+        fwrite(data, 1, bytes, audfp);
+    }
+
+    /* Appends one video frame to the .vid file and logs a Mark record
+     * with the resulting offsets. A change of the raw frame size is
+     * announced in the event log first.
+     */
+    virtual void Video
+        (unsigned vid_width,unsigned vid_height,
+         unsigned vid_fps_scaled, const unsigned char*data)
+    {
+        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
+        size_t bytes = vid_width * vid_height * bpp / 8;
+        size_t framesize = bytes;
+
+        if(framesize != vid_framesize)
+        {
+            vid_framesize = framesize;
+            if(eventfp)
+            {
+                LockF el(eventfp);
+                fprintf(eventfp, "VidFrameSize %lu\n", (unsigned long)vid_framesize);
+                fflush(eventfp);
+            }
+        }
+
+        if(!vidfp) return; /* Nowhere to write to */
+
+        LockF vl(vidfp);
+        
+#ifdef HAVE_X264
+        if(bpp == 12) /* For I420, we use a local X264 encoder */
+        {
+            if(!x264)
+            {
+                x264_param_default(&param);
+                x264_param_parse(&param, "psnr", "no");
+                x264_param_parse(&param, "ssim", "no");
+                param.i_width  = vid_width;
+                param.i_height = vid_height;
+                param.i_csp    = X264_CSP_I420;
+                //param.i_scenecut_threshold = -1;
+                //param.b_bframe_adaptive     = 0;
+                //param.rc.i_rc_method      = X264_RC_CRF;
+                //param.rc.i_qp_constant    = 0;
+                x264_param_parse(&param, "me",       "dia");
+                x264_param_parse(&param, "crf",      "6");
+                x264_param_parse(&param, "frameref", "8");
+                param.i_frame_reference = 1;
+                param.analyse.i_subpel_refine = 1;
+                param.analyse.i_me_method = X264_ME_DIA;
+                /*
+                param.analyse.inter = 0;
+                param.analyse.b_transform_8x8 = 0;
+                param.analyse.b_weighted_bipred = 0;
+                param.analyse.i_trellis = 0;
+                */
+                //param.b_repeat_headers = 1; // guess this might be needed
+                
+                param.i_fps_num = vid_fps_scaled;
+                param.i_fps_den = 1 << 24;
+                
+                x264 = x264_encoder_open(&param);
+                if(!x264)
+                {
+                    fprintf(stderr, "x264_encoder_open failed.\n");
+                    goto raw_fallback;
+                }
+            }
+            
+            const size_t npixels = vid_width * vid_height;
+            x264_picture_t pic;
+            /* NOTE(review): forcekey is set by the constructor and by
+             * SaveState/LoadState but never cleared in this class, so
+             * every frame is requested as IDR. Confirm whether it was
+             * meant to be reset after a successful encode.
+             */
+            pic.i_type = forcekey ? X264_TYPE_IDR : X264_TYPE_AUTO;
+            pic.i_pts  = 0;
+            pic.i_qpplus1 = 0;
+            pic.img.i_csp = X264_CSP_I420;
+            pic.img.i_plane = 3;
+            pic.img.i_stride[0] = vid_width;
+            pic.img.i_stride[1] = vid_width / 2;
+            pic.img.i_stride[2] = vid_width / 2;
+            /* Y plane, then the two quarter-size chroma planes. */
+            pic.img.plane[0] = const_cast<uint8_t*>(data) + npixels*0/4;
+            pic.img.plane[1] = const_cast<uint8_t*>(data) + npixels*4/4;
+            pic.img.plane[2] = const_cast<uint8_t*>(data) + npixels*5/4;
+            
+            x264_nal_t*    nal; int i_nal;
+            x264_picture_t pic_out;
+            if(x264_encoder_encode(x264, &nal, &i_nal, &pic, &pic_out) < 0)
+            {
+                fprintf(stderr, "x264_encoder_encode failed\n");
+                goto raw_fallback;
+            }
+            int i_size = 0;
+            for(int i=0; i<i_nal; ++i) i_size += nal[i].i_payload * 2 + 4;
+            std::vector<unsigned char> muxbuf(i_size);
+            i_size = 0;
+            for(int i=0; i<i_nal; ++i)
+            {
+                int room_required = nal[i].i_payload * 3/2 + 4;
+                if(muxbuf.size() < i_size + room_required)
+                    muxbuf.resize(i_size + room_required);
+                
+                int i_data = muxbuf.size() - i_size;
+                i_size += x264_nal_encode(&muxbuf[i_size], &i_data, 1, &nal[i]);
+            }
+            if(i_size > 0)
+                fwrite(&muxbuf[0], 1, i_size, vidfp);
+        }
+        else
+#endif
+        {
+        raw_fallback:
+            fwrite(data, 1, bytes, vidfp);
+        }
+
+        if(eventfp)
+        {
+            LockF el(eventfp);
+            off_t vidpos = TellOrZero(vidfp);
+            off_t audpos = TellOrZero(audfp);
+            fprintf(eventfp,
+                "%llX %llX Mark\n",
+                (long long)vidpos, (long long)audpos);
+            fflush(eventfp);
+        }
+    }
+    
+    /* NOTE(review): SaveState/LoadState are only compiled with
+     * HAVE_X264 although nothing but forcekey depends on x264;
+     * confirm this is intended.
+     */
+#ifdef HAVE_X264
+    /* Remembers the current offsets under the given slot name, logs a
+     * Save record and rewrites the .state file.
+     */
+    virtual void SaveState(const std::string& slot)
+    {
+        LockF el(eventfp);
+        
+        off_t vidpos = TellOrZero(vidfp);
+        off_t audpos = TellOrZero(audfp);
+    
+        if(eventfp)
+        {
+            fprintf(eventfp,
+                "%llX %llX Save %s\n",
+                 (long long)vidpos, (long long)audpos, slot.c_str());
+            fflush(eventfp);
+        }
+        
+        FrameStates[slot] = std::make_pair(vidpos, audpos);
+        WriteStates();
+        
+        forcekey = true;
+    }
+    
+    /* Logs a Load record holding the current offsets and the offsets
+     * stored for the slot (0 0 when the slot is unknown).
+     */
+    virtual void LoadState(const std::string& slot)
+    {
+        LockF el(eventfp);
+
+        /* Look the slot up without inserting it: an unknown slot must
+         * not end up in the .state file.
+         */
+        std::pair<off_t, off_t> old(0, 0);
+        std::map<std::string, std::pair<off_t, off_t> >::const_iterator
+            known = FrameStates.find(slot);
+        if(known != FrameStates.end()) old = known->second;
+        
+        off_t vidpos = TellOrZero(vidfp);
+        off_t audpos = TellOrZero(audfp);
+        if(eventfp)
+        {
+            fprintf(eventfp,
+                "%llX %llX Load %llX %llX %s\n",
+                (long long)vidpos, (long long)audpos,
+                (long long)old.first,
+                (long long)old.second,
+                slot.c_str());
+            fflush(eventfp);
+        }
+
+        forcekey = true;
+    }
+#endif
+private:
+    /* Opens the four project files, loads the slot table and logs a
+     * Begin record. Open failures are reported with perror().
+     */
+    void SetFn()
+    {
+        std::string vidfn = VIDEO_CMD + ".vid";
+        std::string audfn = VIDEO_CMD + ".aud";
+        std::string eventfn = VIDEO_CMD + ".log";
+        std::string statefn = VIDEO_CMD + ".state";
+        vidfp = fopen(vidfn.c_str(), "ab+");
+        if(!vidfp) perror(vidfn.c_str());
+        audfp = fopen(audfn.c_str(), "ab+");
+        if(!audfp) perror(audfn.c_str());
+        eventfp = fopen(eventfn.c_str(), "ab+");
+        if(!eventfp) perror(eventfn.c_str());
+        statefp = fopen2(statefn.c_str(), "rb+", "wb+");
+        if(!statefp) perror(statefn.c_str());
+        ReadStates();
+
+        if(eventfp)
+        {
+            off_t vidpos = TellOrZero(vidfp);
+            off_t audpos = TellOrZero(audfp);
+            fprintf(eventfp,
+                "%llX %llX Begin\n",
+                (long long)vidpos, (long long)audpos);
+        }
+    }
+    /* fopen() with mode1, falling back to mode2 when that fails. */
+    static FILE* fopen2(const char* fn, const char* mode1, const char* mode2)
+    {
+        FILE* result = fopen(fn, mode1);
+        if(!result) result = fopen(fn, mode2);
+        return result;
+    }
+    /* Reloads FrameStates from the .state file. Reading stops at the
+     * first line starting with '-'; malformed lines are skipped.
+     */
+    void ReadStates()
+    {
+        FrameStates.clear();
+        if(!statefp) return;
+        
+        LockF sl(statefp);
+        
+        char Buf[4096];
+        rewind(statefp);
+        while(fgets(Buf, sizeof(Buf), statefp))
+        {
+            if(*Buf == '-') break;
+            char slotname[4096];
+            unsigned long long vidpos = 0, audpos = 0; /* %llX wants unsigned */
+            strtok(Buf, "\r"); strtok(Buf, "\n");
+            if(sscanf(Buf, "%llX %llX %4095s", &vidpos, &audpos, slotname) != 3)
+                continue; /* Would otherwise use uninitialized data */
+            FrameStates[slotname] = std::pair<off_t,off_t> ((off_t)vidpos, (off_t)audpos);
+        }
+    }
+    /* Rewrites the .state file from FrameStates. The table is ended
+     * with a "-" line, which is where ReadStates() stops; older,
+     * longer content after it is therefore ignored.
+     */
+    void WriteStates()
+    {
+        if(!statefp) return;
+        
+        LockF sl(statefp);
+        
+        rewind(statefp);
+        for(std::map<std::string, std::pair<off_t, off_t> >::const_iterator
+            i = FrameStates.begin(); i != FrameStates.end(); ++i)
+        {
+            fprintf(statefp, "%llX %llX %s\n", 
+                (long long) i->second.first,
+                (long long) i->second.second,
+                i->first.c_str());
+        }
+        fprintf(statefp, "-\n");
+        fflush(statefp);
+    }
+};
+
+
+/* The AVI currently being written. Created on demand by GetAVIptr()
+ * and destroyed by NESVideoNextAVI(); null when no AVI is open.
+ * Note that the variable shares its name with the class.
+ */
+static AVI* AVI = 0;
+
+#ifdef HAVE_GD
+/* State of the logo animation that can be put in front of a video. */
+namespace LogoInfo
+{
+    /* Frame dimensions; refreshed by NESVideoLoggingVideo() on each call. */
+    unsigned width;
+    unsigned height;
+
+    /* Whether the header part (logo frames / silent audio) was already sent. */
+    bool SentVideo = false;
+    bool SentAudio = false;
+    /* Number of logo frames already blended over source video frames. */
+    int OverlapSent = 0;
+}
+#endif
+
+#include "quantize.h"
+#include "rgbtorgb.h"
+
+/* Set by NESVideoSetRerecordingMode(); makes GetAVIptr() create a RerecordingAVI. */
+static bool RerecordingMode = false;
+/* Incremented on every logged video AND audio call while recording. */
+static long CurrentFrameNumber = 0;
+
+extern "C"
+{
+    int LoggingEnabled = 0; /* 0=no, 1=yes, 2=recording! */
+
+    /* Returns the video command currently stored in VIDEO_CMD.
+     * The pointer comes from VIDEO_CMD.c_str(), so it presumably
+     * stays valid only until the command is changed again.
+     */
+    const char* NESVideoGetVideoCmd()
+    {
+        const char* const current = VIDEO_CMD.c_str();
+        return current;
+    }
+    /* Stores a new video command in VIDEO_CMD.
+     * A null pointer is treated as an empty command; assigning it
+     * directly to the string would be undefined behaviour.
+     */
+    void NESVideoSetVideoCmd(const char *cmd)
+    {
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+
+        VIDEO_CMD = cmd ? cmd : "";
+    }
+    
+    /* Switches to rerecording mode and sets the current frame number.
+     * With HAVE_GD, a positive frame number marks the logo header as
+     * already sent, and the overlap counter starts at FrameNumber.
+     */
+    void NESVideoSetRerecordingMode(long FrameNumber)
+    {
+        CurrentFrameNumber = FrameNumber;
+        RerecordingMode    = true;
+#ifdef HAVE_GD
+        const bool resumed = FrameNumber > 0;
+        LogoInfo::SentVideo   = resumed;
+        LogoInfo::SentAudio   = resumed;
+        LogoInfo::OverlapSent = FrameNumber;
+#endif
+    }
+    
+    /* Returns the AVI being written, creating it on first use:
+     * a RerecordingAVI in rerecording mode, a NormalAVI otherwise.
+     */
+    static class AVI& GetAVIptr()
+    {
+        if(AVI) return *AVI;
+
+        if(!RerecordingMode)
+        {
+            fprintf(stderr, "Starting new AVI (num %u)\n", videonumber);
+            AVI = new NormalAVI;
+        }
+        else
+        {
+            fprintf(stderr, "Beginning rerecording project at frame %ld\n", CurrentFrameNumber);
+            AVI = new RerecordingAVI(CurrentFrameNumber);
+        }
+        return *AVI;
+    }
+    
+    /* Tells the current AVI that the emulator state was saved to the
+     * given slot. A null slot name is ignored, as it cannot be
+     * converted to a string.
+     */
+    void NESVideoRerecordingSave(const char* slot)
+    {
+        if(!slot) return;
+        GetAVIptr().SaveState(slot);
+    }
+    
+    /* Tells the current AVI that the emulator state was loaded from
+     * the given slot. A null slot name is ignored, as it cannot be
+     * converted to a string.
+     */
+    void NESVideoRerecordingLoad(const char* slot)
+    {
+        if(!slot) return;
+        GetAVIptr().LoadState(slot);
+    }
+    
+    /* Closes the AVI being written, if any, and bumps videonumber so
+     * that the next one started gets a new number.
+     */
+    void NESVideoNextAVI()
+    {
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+
+        if(!AVI) return; /* Nothing is open */
+
+        fprintf(stderr, "Closing AVI (next will be started)\n");
+        delete AVI;
+        AVI = 0;
+        ++videonumber;
+    }
+
+ #ifdef HAVE_GD
+    /* Blends the first three channels of source over target.
+     * alpha=0 leaves target untouched, alpha=255 replaces it.
+     */
+    static void Overlay32With32(unsigned char* target, const unsigned char* source, int alpha)
+    {
+        for(int channel = 0; channel < 3; ++channel)
+        {
+            const int delta = (int)(source[channel] - target[channel]);
+            target[channel] += delta * alpha / 255;
+        }
+    }
+    
+    /* Blends the PNG image stored in file fn over a 24-bit frame.
+     *
+     * data is expected to hold LogoInfo::width * LogoInfo::height
+     * pixels of 3 bytes each, in blue, green, red order. Only the area
+     * shared by the image and the frame is blended. A missing or
+     * unreadable file, a non-truecolor image or a too small buffer
+     * leaves data untouched.
+     */
+    static void OverlayLogoFrom(const char* fn, std::vector<unsigned char>& data)
+    {
+        FILE*fp = fopen(fn, "rb");
+        if(!fp)
+        {
+            perror(fn);
+            return; /* Ignore missing frames */
+        }
+        
+        gdImagePtr im = gdImageCreateFromPng(fp);
+        fclose(fp); /* The image is fully loaded; the file is not needed anymore */
+        if(!im)
+        {
+            /* gd returns a null image when the file cannot be decoded */
+            fprintf(stderr, "'%s': Could not be loaded as a PNG image\n", fn);
+            return;
+        }
+        if(!gdImageTrueColor(im))
+        {
+            fprintf(stderr, "'%s': Only true color images are supported\n", fn);
+            gdImageDestroy(im);
+            return;
+        }
+        
+        unsigned new_width = gdImageSX(im);
+        unsigned new_height= gdImageSY(im);
+        
+        if(new_width != LogoInfo::width
+        || new_height != LogoInfo::height)
+        {
+            if(new_height < LogoInfo::height || new_height > LogoInfo::height+20)
+            fprintf(stderr, "'%s': ERROR, expected %dx%d, got %dx%d\n", fn,
+                LogoInfo::width, LogoInfo::height,
+                new_width, new_height);
+        }
+        
+        /* Never read outside of the image, nor write outside of data. */
+        const unsigned use_width  = new_width  < LogoInfo::width  ? new_width  : LogoInfo::width;
+        const unsigned use_height = new_height < LogoInfo::height ? new_height : LogoInfo::height;
+        const bool data_fits =
+            data.size() >= (size_t)LogoInfo::width * LogoInfo::height * 3;
+        
+        if(data_fits)
+        {
+            unsigned char pixbuf[4] = {0,0,0,0};
+            for(unsigned y = 0; y < use_height; ++y)
+                for(unsigned x = 0; x < use_width; ++x)
+                {
+                    int color = gdImageTrueColorPixel(im, x,y);
+                    /* gd alpha: 0 = opaque .. 127 = transparent */
+                    int alpha = 255-gdTrueColorGetAlpha(color)*256/128;
+                    pixbuf[2] = gdTrueColorGetRed(color);
+                    pixbuf[1] = gdTrueColorGetGreen(color);
+                    pixbuf[0] = gdTrueColorGetBlue(color);
+                    Overlay32With32(&data[(y*LogoInfo::width+x)*3], pixbuf, alpha);
+                }
+        }
+        else
+        {
+            fprintf(stderr, "'%s': Target frame buffer is too small\n", fn);
+        }
+        
+        gdImageDestroy(im);
+    }
+    
+    /* Returns the file name of logo animation frame number frameno.
+     *
+     * The preferred file is logo_<width>_<height>_f<frameno>.png in
+     * the logo directory, using the dimensions in LogoInfo. If that
+     * file is not readable, the directory is searched for a frame with
+     * the same number whose dimensions are at least as large; the
+     * closest one wins. If none qualifies, the preferred name is
+     * returned even though it is not readable.
+     */
+    static const std::string GetLogoFileName(unsigned frameno)
+    {
+        std::string avdir = "/home/you/yourlogo/";
+        
+        char AvName[512];
+        sprintf(AvName, "logo_%u_%u_f%03u.png",
+            LogoInfo::width,
+            LogoInfo::height,
+            frameno);
+        
+        std::string want = avdir + AvName;
+        if(access(want.c_str(), R_OK) == 0) return want;
+        
+        /* No correct logo file? Check if there's an approximate match. */
+        static std::map<int, std::vector<std::string> > files;
+        /* A separate flag, so that a directory without any logo files
+         * is not scanned (and reported) again on every call.
+         */
+        static bool files_loaded = false;
+        if(!files_loaded) /* Cache the list of logo files. */
+        {
+            files_loaded = true;
+            
+            static const char GlobPat[] = "logo_*_*_f*.png";
+            glob_t globdata;
+            globdata.gl_offs = 0;
+            fprintf(stderr, "Loading list of usable logo animation files in %s...\n", avdir.c_str());
+            int globres = glob( (avdir + GlobPat).c_str(), GLOB_NOSORT, NULL, &globdata);
+            if(globres == 0)
+            {
+                for(size_t n=0; n<globdata.gl_pathc; ++n)
+                {
+                    const char* fn = globdata.gl_pathv[n];
+                    const char* slash = strrchr(fn, '/');
+                    if(slash) fn = slash+1;
+                    
+                    /* Index by frame number; skip names that do not parse */
+                    int gotw=0, goth=0, gotf=0;
+                    if(sscanf(fn, "logo_%d_%d_f%d", &gotw,&goth,&gotf) != 3)
+                        continue;
+                    files[gotf].push_back(fn);
+                }
+            }
+            globfree(&globdata);
+        }
+        
+        std::map<int, std::vector<std::string> >::const_iterator
+            i = files.find(frameno);
+        if(i == files.end()) return want;
+        
+        std::string best;
+        int bestdist = -1;
+        
+        const std::vector<std::string>& fnames = i->second;
+        for(size_t b=fnames.size(), a=0; a<b; ++a)
+        {
+            unsigned gotw=0, goth=0;
+            sscanf(fnames[a].c_str(), "logo_%u_%u", &gotw,&goth);
+            /* Too small images cannot cover the frame */
+            if(gotw < LogoInfo::width || goth < LogoInfo::height) continue;
+            
+            int dist = std::max(gotw - LogoInfo::width,
+                                goth - LogoInfo::height);
+            
+            if(bestdist == -1 || dist < bestdist)
+                { bestdist = dist; best = fnames[a]; }
+        }
+        
+        if(bestdist >= 0) want = avdir + best;
+        return want;
+    }
+    
+    /* Converts a 24-bit frame of LogoInfo dimensions with
+     * Convert24To16Frame(); the output takes 2 bytes per pixel.
+     */
+    static const std::vector<unsigned char> NVConvert24To16Frame
+        (const std::vector<unsigned char>& logodata)
+    {
+        const unsigned npixels = LogoInfo::width * LogoInfo::height;
+        std::vector<unsigned char> packed(npixels * 2);
+        Convert24To16Frame(&logodata[0], &packed[0], npixels, LogoInfo::width);
+        return packed;
+    }
+    /* Converts a 24-bit frame of LogoInfo dimensions with
+     * Convert24To15Frame(); the output takes 2 bytes per pixel.
+     */
+    static const std::vector<unsigned char> NVConvert24To15Frame
+        (const std::vector<unsigned char>& logodata)
+    {
+        const unsigned npixels = LogoInfo::width * LogoInfo::height;
+        std::vector<unsigned char> packed(npixels * 2);
+        Convert24To15Frame(&logodata[0], &packed[0], npixels, LogoInfo::width);
+        return packed;
+    }
+    
+    /* Converts a 24-bit frame of LogoInfo dimensions with
+     * Convert24To_I420Frame(); the output takes 12 bits per pixel.
+     */
+    static const std::vector<unsigned char> NVConvert24To_I420Frame
+        (const std::vector<unsigned char>& logodata)
+    {
+        const unsigned npixels = LogoInfo::width * LogoInfo::height;
+        std::vector<unsigned char> planar(npixels * 3 / 2);
+        Convert24To_I420Frame(&logodata[0], &planar[0], npixels, LogoInfo::width);
+        return planar;
+    }
+    
+    /* Converts a 24-bit frame of LogoInfo dimensions with
+     * Convert24To_YUY2Frame().
+     *
+     * YUY2 is a packed 4:2:2 format taking 2 bytes per pixel (the
+     * rest of this file also counts it as 16 bpp), so the output
+     * buffer must be width*height*2 bytes. The 3/2 factor used before
+     * is the I420 size and was too small for a whole YUY2 frame.
+     */
+    static const std::vector<unsigned char> NVConvert24To_YUY2Frame
+        (const std::vector<unsigned char>& logodata)
+    {
+        const unsigned npixels = LogoInfo::width * LogoInfo::height;
+        std::vector<unsigned char> result(npixels * 2);
+        Convert24To_YUY2Frame(&logodata[0], &result[0], npixels, LogoInfo::width);
+        return result;
+    }
+    
+    /* Expands npixels pixels with Convert16To24Frame() into a new
+     * buffer of 3 bytes per pixel.
+     */
+    static const std::vector<unsigned char> NVConvert16To24Frame
+        (const void* data, unsigned npixels)
+    {
+        /* The buffer starts out zeroed, i.e. black. */
+        std::vector<unsigned char> frame24(npixels*3);
+        unsigned char* const out = &frame24[0];
+        Convert16To24Frame(data, out, npixels);
+        return frame24;
+    }
+    
+    /* Expands npixels pixels with Convert15To24Frame() into a new
+     * buffer of 3 bytes per pixel.
+     */
+    static const std::vector<unsigned char> NVConvert15To24Frame
+        (const void* data, unsigned npixels)
+    {
+        /* The buffer starts out zeroed, i.e. black. */
+        std::vector<unsigned char> frame24(npixels*3);
+        unsigned char* const out = &frame24[0];
+        Convert15To24Frame(data, out, npixels);
+        return frame24;
+    }
+    
+    /* Expands npixels pixels with Convert_I420To24Frame() into a new
+     * buffer of 3 bytes per pixel; LogoInfo::width is passed as the
+     * frame width.
+     */
+    static const std::vector<unsigned char> NVConvert_I420To24Frame
+        (const void* data, unsigned npixels)
+    {
+        /* The buffer starts out zeroed, i.e. black. */
+        std::vector<unsigned char> frame24(npixels*3);
+        unsigned char* const out = &frame24[0];
+        Convert_I420To24Frame(data, out, npixels, LogoInfo::width);
+        return frame24;
+    }
+    
+    /* Expands npixels pixels with Convert_YUY2To24Frame() into a new
+     * buffer of 3 bytes per pixel; LogoInfo::width is passed as the
+     * frame width.
+     */
+    static const std::vector<unsigned char> NVConvert_YUY2To24Frame
+        (const void* data, unsigned npixels)
+    {
+        /* The buffer starts out zeroed, i.e. black. */
+        std::vector<unsigned char> frame24(npixels*3);
+        unsigned char* const out = &frame24[0];
+        Convert_YUY2To24Frame(data, out, npixels, LogoInfo::width);
+        return frame24;
+    }
+    
+    /* Replaces single-colored leading frames with black ones.
+     *
+     * As long as every frame seen so far consisted of one single
+     * 16-bit value (such as gray for NES), data is redirected to an
+     * all-zero frame of LogoInfo dimensions, to avoid ugly backgrounds
+     * on logo animations. Once a frame with differing pixels has been
+     * seen, data is never touched again.
+     */
+    static void SubstituteWithBlackIfNeeded(const void*& data)
+    {
+        static bool     seen_varied = false; /* A real picture has appeared */
+        static short*   black_frame = 0;     /* The substitute frame */
+        static unsigned black_w = 0, black_h = 0;
+        
+        if(seen_varied)
+        {
+            /* The substitute is not needed anymore; let it go. */
+            delete[] black_frame;
+            black_frame = 0;
+            return;
+        }
+        
+        const unsigned npixels = LogoInfo::width * LogoInfo::height;
+        const short* const pixels = (const short*)data;
+        for(unsigned n = 0; n < npixels; ++n)
+        {
+            if(pixels[n] == pixels[0]) continue;
+            seen_varied = true;
+            return;
+        }
+        
+        /* A substitute made for other dimensions cannot be reused. */
+        const bool resized = black_w != LogoInfo::width || black_h != LogoInfo::height;
+        if(black_frame && resized)
+        {
+            delete[] black_frame;
+            black_frame = 0;
+        }
+        
+        black_w = LogoInfo::width;
+        black_h = LogoInfo::height;
+        
+        if(!black_frame)
+            black_frame = new short[npixels](); /* zero-filled */
+        
+        data = (void*)black_frame;
+    }
+#endif
+
+#ifdef HAVE_GD
+    /* Helper of NESVideoLoggingVideo: expands one input frame, given
+     * in the format selected by INPUT_BPP, into a 24-bit frame.
+     */
+    static std::vector<unsigned char> NVInputTo24Frame
+        (const void* data, unsigned npixels)
+    {
+        switch(INPUT_BPP)
+        {
+            case 16: return NVConvert16To24Frame(data, npixels);
+            case 15: return NVConvert15To24Frame(data, npixels);
+            case 17: return NVConvert_YUY2To24Frame(data, npixels);
+            case 12: return NVConvert_I420To24Frame(data, npixels);
+        }
+        /* Anything else is taken to be 24-bit already: plain copy. */
+        std::vector<unsigned char> copy(npixels*3);
+        memcpy(&copy[0], data, npixels*3);
+        return copy;
+    }
+    
+    /* Helper of NESVideoLoggingVideo: converts a 24-bit frame into the
+     * format selected by INPUT_BPP and passes it to the current AVI.
+     */
+    static void NVSendFrameFrom24
+        (const std::vector<unsigned char>& frame24,
+         unsigned width, unsigned height, unsigned fps_scaled)
+    {
+        std::vector<unsigned char> converted;
+        bool use_converted = true;
+        switch(INPUT_BPP)
+        {
+            case 16: converted = NVConvert24To16Frame(frame24); break;
+            case 15: converted = NVConvert24To15Frame(frame24); break;
+            case 12: converted = NVConvert24To_I420Frame(frame24); break;
+            case 17: converted = NVConvert24To_YUY2Frame(frame24); break;
+            default: use_converted = false; break; /* Send as is */
+        }
+        GetAVIptr().Video(width,height,fps_scaled,
+            use_converted ? &converted[0] : &frame24[0]);
+    }
+#endif
+
+    /* Logs one frame of video. Does nothing unless LoggingEnabled >= 2.
+     *
+     * data        pixel data of the frame
+     * width,height  frame dimensions in pixels
+     * fps_scaled  frame rate multiplied by 1 << 24
+     * bpp         pixel format: 32, 24, 16, 15, 12 (I420) or 17 (YUY2);
+     *             0 keeps the format of the previous call. 32-bit input
+     *             is converted to 24-bit before use.
+     *
+     * With HAVE_GD, the logo animation is sent before the first frame,
+     * and then blended over the first frames of the source video.
+     */
+    void NESVideoLoggingVideo
+        (const void*data, unsigned width,unsigned height,
+         unsigned fps_scaled,
+         unsigned bpp
+        )
+    {
+        if(LoggingEnabled < 2) return;
+        
+        ++CurrentFrameNumber;
+        
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+
+        if(bpp == 32) /* Convert 32 to 24 */
+        {
+            bpp = 24;
+            
+            /* Static, so that the buffer is reused between calls and
+             * stays alive until the frame has been sent below.
+             */
+            static std::vector<unsigned char> VideoBuf;
+            VideoBuf.resize(width*height * 3);
+            
+            Convert32To24Frame(data, &VideoBuf[0], width*height);
+            data = (void*)&VideoBuf[0];
+        }
+        
+        if(bpp) INPUT_BPP = bpp;
+        
+        switch(INPUT_BPP)
+        {
+            case 32: USE_FOURCC = BGR32; break;
+            case 24: USE_FOURCC = BGR24; break;
+            case 16: USE_FOURCC = BGR16; break;
+            case 15: USE_FOURCC = BGR15; break;
+            case 12: USE_FOURCC = I420; break;
+            case 17: USE_FOURCC = YUY2; break;
+        }
+        
+#ifdef HAVE_GD
+        const int LogoFramesHeader  = (int)( (LOGO_LENGTH_HEADER  * fps_scaled) / (1 << 24) );
+        const int LogoFramesOverlap = (int)( (LOGO_LENGTH_OVERLAP * fps_scaled) / (1 << 24) );
+        
+        LogoInfo::width  = width;
+        LogoInfo::height = height;
+        
+        if(INPUT_BPP == 16 || INPUT_BPP == 15)
+        {
+            SubstituteWithBlackIfNeeded(data);
+        }
+        else if(INPUT_BPP != 24 && INPUT_BPP != 12 && INPUT_BPP != 17)
+        {
+            fprintf(stderr, "NESVIDEOS_PIECE only supports 16 and 24 bpp, you gave %u bpp\n",
+                bpp);
+            return;
+        }
+
+        if(!LogoInfo::SentVideo)
+        {
+            /* Send the animation frames that do not involve source video. */
+            LogoInfo::SentVideo=true;
+
+            for(int frame = 0; frame < LogoFramesHeader; ++frame)
+            {
+                std::vector<unsigned char> logodata(width*height*3); /* filled with black. */
+                
+                std::string fn = GetLogoFileName(frame);
+                OverlayLogoFrom(fn.c_str(), logodata);
+                
+                NVSendFrameFrom24(logodata, width,height,fps_scaled);
+            }
+        }
+        
+        if(LogoInfo::OverlapSent < LogoFramesOverlap)
+        {
+            /* Send an animation frame that mixes source and animation;
+             * it replaces the plain source frame.
+             */
+            std::string fn = GetLogoFileName(LogoInfo::OverlapSent + LogoFramesHeader);
+
+            std::vector<unsigned char> logodata = NVInputTo24Frame(data, width*height);
+            OverlayLogoFrom(fn.c_str(), logodata);
+            
+            NVSendFrameFrom24(logodata, width,height,fps_scaled);
+
+            ++LogoInfo::OverlapSent;
+            return;
+        }
+#endif
+        
+        GetAVIptr().Video(width,height,fps_scaled,  (const unsigned char*) data);
+    }
+
+    /* Logs nsamples samples of audio in the given rate/bits/chans
+     * format. Does nothing unless LoggingEnabled >= 2.
+     *
+     * With HAVE_GD, the very first call is preceded by
+     * LOGO_LENGTH_HEADER seconds of silence, to go with the logo
+     * animation frames sent before the video.
+     */
+    void NESVideoLoggingAudio
+        (const void*data,
+         unsigned rate, unsigned bits, unsigned chans,
+         unsigned nsamples)
+    {
+        if(LoggingEnabled < 2) return;
+        
+        ++CurrentFrameNumber;
+        
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+#ifdef HAVE_GD
+        if(!LogoInfo::SentAudio && LOGO_LENGTH_HEADER > 0)
+        {
+            LogoInfo::SentAudio=true;
+            
+            double HdrLength = LOGO_LENGTH_HEADER; // N64 workaround
+            const long silent_samples = (long)(rate * HdrLength);
+            
+            if(silent_samples > 0)
+            {
+                unsigned bytes = silent_samples*chans*(bits/8);
+                /* calloc hands out zeroed memory, which is silence. */
+                unsigned char* silence = (unsigned char*)calloc(bytes, 1);
+                if(silence)
+                {
+                    GetAVIptr().Audio(rate,bits,chans, silence, silent_samples);
+                    free(silence);
+                }
+            }
+        }
+#endif
+        
+        GetAVIptr().Audio(rate,bits,chans, (const unsigned char*) data, nsamples);
+    }
+} /* extern "C" */
diff --git a/src/drivers/videolog/nesvideos-piece.h b/src/drivers/videolog/nesvideos-piece.h
new file mode 100644
index 00000000..e9037566
--- /dev/null
+++ b/src/drivers/videolog/nesvideos-piece.h
@@ -0,0 +1,50 @@
+#ifndef NESVPIECEhh
+#define NESVPIECEhh
+
+#define NESVIDEOS_LOGGING 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Is video logging enabled? 0=no, 1=yes, 2=active. Default value: 0 */
+extern int LoggingEnabled;
+
+/* Get and set the video recording command (shell command) */
+extern const char* NESVideoGetVideoCmd(void);
+extern void NESVideoSetVideoCmd(const char *cmd);
+
+/* Save 1 frame of video. bpp is presumably the bit depth of *data (an older note here assumed 16-bit RGB) -- TODO confirm. */
+/* FPS is scaled by 24 bits (*0x1000000) */
+/* Does not do anything if LoggingEnabled<2. */
+extern void NESVideoLoggingVideo
+    (const void*data, unsigned width, unsigned height,
+     unsigned fps_scaled,
+     unsigned bpp);
+
+/* Save nsamples samples of audio. rate, bits and chans describe the sample format and are passed on every call. */
+/* Does not do anything if LoggingEnabled<2. */
+/* The interval of calling this function is not important, as long as all the audio
+ * data is eventually written without too big a delay (5 seconds is too big).
+ * This function may be called multiple times per video frame, or once per a few video
+ * frames, or anything in between. Just that all audio data must be written exactly once,
+ * and in order. */
+extern void NESVideoLoggingAudio
+    (const void*data,
+     unsigned rate, unsigned bits, unsigned chans,
+     unsigned nsamples);
+/* nsamples*chans*(bits/8) = bytes in *data. */
+
+/* Requests the current AVI to be closed and a new one to be started. */
+/* Use when encoding parameters have changed. Declared with (void) so that C callers get a real prototype. */
+extern void NESVideoNextAVI(void);
+
+extern void NESVideoSetRerecordingMode(long FrameNumber);
+extern void NESVideoRerecordingSave(const char* slot);
+extern void NESVideoRerecordingLoad(const char* slot);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/drivers/videolog/quantize.h b/src/drivers/videolog/quantize.h
new file mode 100644
index 00000000..b5d9f614
--- /dev/null
+++ b/src/drivers/videolog/quantize.h
@@ -0,0 +1,185 @@
+/*
+ Ordered dithering methods provided for:
+   8x8 (Quantize8x8)
+   4x4 (Quantize4x4)
+   3x3 (Quantize3x3)
+   4x2 (Quantize4x2)
+   3x2 (Quantize3x2)
+   2x2 (Quantize2x2)
+ The functions are:
+ 
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, double value)
+   
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, unsigned char value)
+
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+ Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+*/
+
+#define OrderedDitherDecl(n) /* common members of an n-entry dither table struct; expects in_max to be in scope */ \
+    static const double flts[n]; \
+    static const int ints[n]; \
+    enum { mul = n+1, \
+           maxin = in_max, \
+           even = !(maxin % mul), \
+           intmul = even ? 1 : mul };
+
+/* macros for initializing dither tables; fully parenthesized so they expand safely inside any expression */
+#define d(n) ((n)/double(mul) - 0.5)
+#define i(n) (even ? ((n)*in_max/mul - (int)in_max/2) \
+                   : ((n)*in_max - (int)mul*in_max/2))
+
+template<int m, int in_max = 255>
+struct QuantizeNoDither
+{
+    int res; /* input rescaled from 0..in_max to 0..m, fraction discarded */
+    template<typename Value>
+    QuantizeNoDither(Value v) { res = v * m / in_max; }
+    operator int() const { return res; }
+};
+
+template<int m, typename Base>
+struct QuantizeFuncBase: private Base
+{
+    int res;
+
+    /* Floating-point input: scale to 0..m, add the matrix threshold, clamp at m. */
+    QuantizeFuncBase(size_t quant_pos, double v) : res(0)
+    {
+        if(!(v > 0.0)) return; /* anything not above zero quantizes to 0 */
+
+        const int level = (int)(v * (m / double(Base::maxin)) + Base::flts[quant_pos]);
+        res = (level > m) ? m : level;
+    }
+
+    /* 8-bit input: integer arithmetic only. */
+    QuantizeFuncBase(size_t quant_pos, unsigned char v) : res(v)
+    {
+        if(m < Base::maxin)
+        {
+            /* Fewer output levels than input levels: apply ordered dithering. */
+            const int scale = Base::intmul;
+            const int numerator = res * (m * scale) + Base::ints[quant_pos];
+            res = numerator / (Base::maxin * scale);
+        }
+        else if(m != Base::maxin)
+        {
+            /* More output levels than input levels: plain rescale. */
+            res = QuantizeNoDither<m, Base::maxin> (res);
+        }
+        /* m == Base::maxin: the value passes through unchanged. */
+    }
+};
+
+#define QuantizeFuncDecl(name, base) /* declares template struct `name`: a QuantizeFuncBase over table `base`, convertible to int */ \
+  template<int m, int in_max=255> \
+  struct name: private QuantizeFuncBase<m, base<in_max> > \
+  { \
+      typedef QuantizeFuncBase<m, base<in_max> > Base; \
+      template<typename A, typename B> name(A a, B b) : Base(a, b) { } \
+      operator int() const { return Base::res; } \
+  }
+
+/******* Quantizing with 8x8 ordered dithering: Quantize8x8<m, in_max> ********/
+template<int in_max> struct OrderedDither_8x8 { OrderedDitherDecl(8*8) };
+    template<int in_max>
+    const double OrderedDither_8x8<in_max>::flts[] /* thresholds for double input: d(k) = k/65.0 - 0.5 */
+    = { d(1 ), d(49), d(13), d(61), d( 4), d(52), d(16), d(64),
+        d(33), d(17), d(45), d(29), d(36), d(20), d(48), d(32),
+        d(9 ), d(57), d( 5), d(53), d(12), d(60), d( 8), d(56),
+        d(41), d(25), d(37), d(21), d(44), d(28), d(40), d(24),
+        d(3 ), d(51), d(15), d(63), d( 2), d(50), d(14), d(62),
+        d(35), d(19), d(47), d(31), d(34), d(18), d(46), d(30),
+        d(11), d(59), d( 7), d(55), d(10), d(58), d( 6), d(54),
+        d(43), d(27), d(39), d(23), d(42), d(26), d(38), d(22) };
+    template<int in_max>
+    const int OrderedDither_8x8<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i(1 ), i(49), i(13), i(61), i( 4), i(52), i(16), i(64),
+        i(33), i(17), i(45), i(29), i(36), i(20), i(48), i(32),
+        i(9 ), i(57), i( 5), i(53), i(12), i(60), i( 8), i(56),
+        i(41), i(25), i(37), i(21), i(44), i(28), i(40), i(24),
+        i(3 ), i(51), i(15), i(63), i( 2), i(50), i(14), i(62),
+        i(35), i(19), i(47), i(31), i(34), i(18), i(46), i(30),
+        i(11), i(59), i( 7), i(55), i(10), i(58), i( 6), i(54),
+        i(43), i(27), i(39), i(23), i(42), i(26), i(38), i(22) };
+QuantizeFuncDecl(Quantize8x8, OrderedDither_8x8);
+
+
+/******* Quantizing with 4x4 ordered dithering: Quantize4x4<m, in_max> ********/
+template<int in_max> struct OrderedDither_4x4 { OrderedDitherDecl(4*4) };
+    template<int in_max>
+    const double OrderedDither_4x4<in_max>::flts[] /* thresholds for double input: d(k) = k/17.0 - 0.5 */
+    = { d( 1), d( 9), d( 3), d(11),
+        d(13), d( 5), d(15), d( 7),
+        d( 4), d(12), d( 2), d(10),  
+        d(16), d( 8), d(14), d( 6) };
+    template<int in_max>
+    const int OrderedDither_4x4<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i( 1), i( 9), i( 3), i(11),
+        i(13), i( 5), i(15), i( 7),
+        i( 4), i(12), i( 2), i(10),
+        i(16), i( 8), i(14), i( 6) };
+QuantizeFuncDecl(Quantize4x4, OrderedDither_4x4);
+
+/******* Quantizing with 3x3 ordered dithering: Quantize3x3<m, in_max> ********/
+template<int in_max> struct OrderedDither_3x3 { OrderedDitherDecl(3*3) };
+    template<int in_max>
+    const double OrderedDither_3x3<in_max>::flts[] /* thresholds for double input: d(k) = k/10.0 - 0.5 */
+    = { d(1), d(7), d(3),
+        d(6), d(4), d(9),
+        d(8), d(2), d(5) };
+    template<int in_max>
+    const int OrderedDither_3x3<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i(1), i(7), i(3),
+        i(6), i(4), i(9),  
+        i(8), i(2), i(5) };
+QuantizeFuncDecl(Quantize3x3, OrderedDither_3x3);
+
+/******* Quantizing with 4x2 ordered dithering: Quantize4x2<m, in_max> ********/
+template<int in_max> struct OrderedDither_4x2 { OrderedDitherDecl(4*2) };
+    template<int in_max>
+    const double OrderedDither_4x2<in_max>::flts[] /* thresholds for double input: d(k) = k/9.0 - 0.5 */
+    = { d(1), d(5), d(2), d(6),
+        d(7), d(3), d(8), d(4) };
+    template<int in_max>
+    const int OrderedDither_4x2<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i(1), i(5), i(2), i(6),
+        i(7), i(3), i(8), i(4) };
+QuantizeFuncDecl(Quantize4x2, OrderedDither_4x2);
+
+/******* Quantizing with 3x2 ordered dithering: Quantize3x2<m, in_max> ********/
+template<int in_max> struct OrderedDither_3x2 { OrderedDitherDecl(3*2) };
+    template<int in_max>
+    const double OrderedDither_3x2<in_max>::flts[] /* thresholds for double input: d(k) = k/7.0 - 0.5 */
+    = { d(1), d(5), d(3),
+        d(4), d(2), d(6) };
+    template<int in_max>
+    const int OrderedDither_3x2<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i(1), i(5), i(3),
+        i(4), i(2), i(6) };
+QuantizeFuncDecl(Quantize3x2, OrderedDither_3x2);
+
+/******* Quantizing with 2x2 ordered dithering: Quantize2x2<m, in_max> ********/
+template<int in_max> struct OrderedDither_2x2 { OrderedDitherDecl(2*2) };
+    template<int in_max>
+    const double OrderedDither_2x2<in_max>::flts[] /* thresholds for double input: d(k) = k/5.0 - 0.5 */
+    = { d(1), d(4),
+        d(3), d(2) };
+    template<int in_max>
+    const int OrderedDither_2x2<in_max>::ints[] /* the same matrix as integer thresholds, built with i(k) */
+    = { i(1), i(4),
+        i(3), i(2) };
+QuantizeFuncDecl(Quantize2x2, OrderedDither_2x2);
+
+
+#undef OrderedDitherDecl
+#undef QuantizeFuncDecl
+#undef i
+#undef d
diff --git a/src/drivers/videolog/rgbtorgb.cpp b/src/drivers/videolog/rgbtorgb.cpp
new file mode 100644
index 00000000..76f65602
--- /dev/null
+++ b/src/drivers/videolog/rgbtorgb.cpp
@@ -0,0 +1,1111 @@
+#include <stdint.h>
+#include <stdlib.h> // for size_t
+#include <vector>
+#include <cmath>
+
+/* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ */
+
+typedef uint_least64_t uint64_t;
+
+#include "quantize.h"
+#include "rgbtorgb.h"
+#include "simd.h"
+
+/* For BPP conversions: 64-bit masks applied to registers that hold several packed pixels */
+
+static const uint64_t mask24l        __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; /* bytes 0-2 */
+static const uint64_t mask24h        __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; /* bytes 3-5 */
+static const uint64_t mask24hh       __attribute__((aligned(8))) = 0xffff000000000000ULL; /* bytes 6-7 */
+static const uint64_t mask24hhh      __attribute__((aligned(8))) = 0xffffffff00000000ULL; /* bytes 4-7 */
+static const uint64_t mask24hhhh     __attribute__((aligned(8))) = 0xffffffffffff0000ULL; /* bytes 2-7 */
+
+static const uint64_t mask64h        __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL; /* odd bytes */
+static const uint64_t mask64l        __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL; /* even bytes */
+static const uint64_t mask64hw       __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL; /* odd 16-bit words */
+static const uint64_t mask64lw       __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL; /* even 16-bit words */
+static const uint64_t mask64hd       __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL; /* upper 32 bits */
+static const uint64_t mask64ld       __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL; /* lower 32 bits */
+
+/* For RGB2YUV: fixed-point coefficients, each pre-multiplied by 1<<RGB2YUV_SHIFT */
+
+static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */
+
+static const int RY = 8414;  //  ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int RV = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int RU = -4856; //  ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int GY = 16519; //  ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int GV = -12051;//  ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int GU = -9534; //  ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int BY = 3208;  //  ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int BV = -2339; //  ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int BU = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int Y_ADD = 16;  /* offsets added after the shift */
+static const int U_ADD = 128;
+static const int V_ADD = 128;
+
+/* For YUV2RGB: fixed-point coefficients, each pre-multiplied by 1<<YUV2RGB_SHIFT */
+
+static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */
+
+static const int Y_REV = 9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));
+static const int VR = 14688;   // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int VG = -6659;   // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int UG = -3208;   // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int UB = 16525;   // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+
+/****************/
+
+template<typename c64> /* packs eight 32-bit pixels (two per argument) into 24 bytes at dest, dropping byte 3 of each pixel */
+static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest)
+{
+    c64 r0 = (w0 & mask24l) | ((w0 >> 8) & mask24h); /* bbbaaa: pixels a,b squeezed to 6 bytes */
+    c64 r1 = (w1 & mask24l) | ((w1 >> 8) & mask24h); /* dddccc */
+    c64 r2 = (w2 & mask24l) | ((w2 >> 8) & mask24h); /* fffeee */
+    c64 r3 = (w3 & mask24l) | ((w3 >> 8) & mask24h); /* hhhggg */
+    /* The diagrams below list bytes from most to least significant; each Put stores 8 bytes. */
+    /* ccbbbaaa */
+    ((r0      )  | ((r1 << 48) & mask24hh)).Put(dest+0);
+    /* feeedddc */
+    ((r1 >> 16)  | ((r2 << 32) & mask24hhh)).Put(dest+8);
+    /* hhhgggff */
+    ((r2 >> 32)  | ((r3 << 16) & mask24hhhh)).Put(dest+16);
+}
+
+#if defined(__x86_64) || defined(USE_MMX)
+/* Reads 32 bytes (eight 32-bit pixels) from src and hands them to the register-based overload. */
+static void Convert32To24_32bytes(const unsigned char* src,
+                                  unsigned char* dest)
+{
+    c64 quad[4]; /* quad[k] receives the 8 bytes at src + 8*k */
+    for(unsigned k=0; k<4; ++k)
+        quad[k].Get(src + 8*k);
+    Convert32To24_32bytes(quad[0],quad[1],quad[2],quad[3], dest);
+}
+#endif
+
+void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
+{
+    const unsigned char* in = (const unsigned char*)data;
+
+    #if defined(__x86_64) || defined(USE_MMX)
+    /* Bulk path: eight pixels (32 bytes in, 24 bytes out) per step. */
+    for(; npixels >= 8; npixels -= 8, in += 4*8, dest += 3*8)
+        Convert32To24_32bytes(in, dest);
+     #ifdef USE_MMX
+     MMX_clear();
+     #endif
+    #endif
+
+    /* Remaining pixels: copy the first three bytes of each and skip the fourth. */
+    while(npixels > 0)
+    {
+        dest[0] = in[0];
+        dest[1] = in[1];
+        dest[2] = in[2];
+        in   += 4;
+        dest += 3;
+        --npixels;
+    }
+}
+
+/* Expands an RGB565 value into target[0..2] = R,G,B (R = bits 11-15), replicating high bits as the SIMD path does. */
+static void Unbuild16(unsigned char* target, unsigned rgb16)
+{
+    const unsigned b5 = rgb16 & 31, g6 = (rgb16 >> 5) & 63, r5 = (rgb16 >> 11) & 31;
+    /* Copy the top bits into the freed low bits so that the maximum maps to 255, not 248/252. */
+    target[0] = (r5 << 3) | (r5 >> 2);
+    target[1] = (g6 << 2) | (g6 >> 4);
+    target[2] = (b5 << 3) | (b5 >> 2);
+}
+
+/* Expands an RGB555 value into target[0..2] = R,G,B (R = bits 10-14), replicating high bits as the SIMD path does. */
+static void Unbuild15(unsigned char* target, unsigned rgb16)
+{
+    const unsigned b5 = rgb16 & 31, g5 = (rgb16 >> 5) & 31, r5 = (rgb16 >> 10) & 31;
+    /* (c << 3) | (c >> 2) maps 0..31 onto 0..255 with both end points exact. */
+    target[0] = (r5 << 3) | (r5 >> 2);
+    target[1] = (g5 << 3) | (g5 >> 2);
+    target[2] = (b5 << 3) | (b5 >> 2);
+}
+
+/* 64-bit constant made of four 16-bit lanes: lo, hi, lo, hi (lowest lane first). */
+template<int lane_even, int lane_odd>
+struct Bits16const
+{
+    static const uint64_t static_value =
+        /* build the 32-bit lo/hi pair once, then duplicate it into the upper half */
+        ( (uint64_t)(unsigned short) lane_even
+        | ((uint64_t)(unsigned short) lane_odd << 16) ) * 0x0000000100000001ULL;
+    static const uint64_t value; /* defined out of class below */
+};
+template<int lane_even, int lane_odd>
+const uint64_t Bits16const<lane_even, lane_odd>::value =
+               Bits16const<lane_even, lane_odd>::static_value;
+
+/* 64-bit constant made of two 32-bit lanes: lo in bits 0-31, hi in bits 32-63. */
+template<int lane_lo, int lane_hi>
+struct Bits32const
+{
+    static const uint64_t static_value =
+        ((uint64_t)(unsigned int) lane_hi << 32) | (uint64_t)(unsigned int) lane_lo;
+    static const uint64_t value = static_value;
+};
+/* No out-of-class definition of value is provided here (unlike Bits16const);
+ * the original kept one commented out:
+ *   const uint64_t Bits32const<lo, hi>::value = Bits32const<lo, hi>::static_value; */
+
+/* 64-bit constant built from two values repeated four times:
+ * byte_even is OR-ed in at bit offsets 0, 16, 32, 48 and
+ * byte_odd at bit offsets 8, 24, 40, 56. For arguments below 256
+ * this fills the even and the odd bytes respectively. */
+template<uint64_t byte_even, uint64_t byte_odd>
+struct Bits8const
+{
+    /* one even/odd pair, before it is repeated */
+    static const uint64_t pair = byte_even | (byte_odd << 8);
+
+    static const uint64_t static_value =
+        pair | (pair << 16) | (pair << 32) | (pair << 48);
+    static const uint64_t value = static_value;
+};
+
+
+template<int nbits_even, int nbits_odd, int shift>
+struct MaskBconst
+{
+    static const uint64_t basevalue_lo = (1 << nbits_even) - 1; /* nbits_even low bits set */
+    static const uint64_t basevalue_hi = (1 << nbits_odd) - 1;  /* nbits_odd low bits set */
+    static const uint64_t value = Bits8const<basevalue_lo,basevalue_hi>::value << shift; /* repeated per byte pair, then shifted */
+};
+
+/* Byte-lane masks for a colour component that is nbits wide. */
+template<int nbits>
+struct Convert_2byte_consts
+{
+    static const uint64_t mask_lo;   /* nbits low bits of every even byte */
+    static const uint64_t mask_hi;   /* nbits low bits of every odd byte */
+    static const uint64_t mask_frac; /* 8-nbits low bits of every byte */
+};
+
+/* Out-of-class definitions (Convert_2byte_helper binds references to these). */
+template<int nbits> const uint64_t Convert_2byte_consts<nbits>::mask_lo   = MaskBconst<nbits, 0, 0>::value;
+template<int nbits> const uint64_t Convert_2byte_consts<nbits>::mask_hi   = MaskBconst<nbits, 0, 8>::value;
+template<int nbits> const uint64_t Convert_2byte_consts<nbits>::mask_frac = MaskBconst<8-nbits, 8-nbits, 0>::value;
+
+template<int offs, int bits> /* extracts one colour component (bits wide, starting at bit offs) from eight 16-bit pixels */
+struct Convert_2byte_helper
+{
+    c64 lo, hi; /* lo: components of the four pixels in p4a, hi: those of p4b; one per 16-bit lane */
+    
+    Convert_2byte_helper(c64 p4a, c64 p4b)
+    {
+        const uint64_t& mask_lo   = Convert_2byte_consts<bits>::mask_lo;
+        const uint64_t& mask_hi   = Convert_2byte_consts<bits>::mask_hi;
+        const uint64_t& mask_frac = Convert_2byte_consts<bits>::mask_frac;
+        /* NOTE(review): for offs > 8 the count (8-offs) below is negative; presumably c64's shift operators cope with that -- confirm in simd.h */
+        /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */
+        /* p4a's component goes to the even bytes, p4b's to the odd bytes (diagram shown for bits=5): */
+        /* 000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb */
+        c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi);
+
+        /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE (top bits are repeated in the low bits) */
+        
+        /* BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000 */
+        /* 00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb */
+        c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac);
+        /* v8:
+         *
+         * BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb *
+         */
+        
+        /* STEP 3: DEINTERLACE THE PIXELS (even bytes -> lo, odd bytes -> hi) */
+        lo = (v8     ) & mask64l;
+        hi = (v8 >> 8) & mask64l;
+    }
+};
+
+/* Converts eight 16-bit pixels at src into eight pixels at dest: packed 24-bit
+ * (rgb24=true, 24 bytes written) or 32-bit (rgb24=false, 32 bytes written).
+ * Each colour component is described by its bit offset and bit width within the
+ * 16-bit word. Per output pixel the bytes are stored in the order b, g, r
+ * components (followed by a zero byte in the 32-bit case). */
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits, bool rgb24>
+static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest)
+{
+    c64 p4a; p4a.Get(src+0); // four pixels
+    c64 p4b; p4b.Get(src+8); // another four pixels
+    
+    /* in: In both registers: */
+    
+    Convert_2byte_helper<roffs,rbits> r(p4a,p4b);
+    Convert_2byte_helper<boffs,bbits> b(p4a,p4b);
+    Convert_2byte_helper<goffs,gbits> g(p4a,p4b);
+
+    /* STEP 4: CONVERT PIXELS INTO RGB32 */
+    
+    /* Now we have:
+     *               b.lo =  0j0g0d0a
+     *               g.lo =  0k0h0e0b
+     *               r.lo =  0l0i0f0c
+     *               b.hi =  0J0G0D0A
+     *               g.hi =  0K0H0E0B
+     *               r.hi =  0L0I0F0C
+     * We want:
+     *                 w1 =  0fed0cba
+     *                 w2 =  0lkj0ihg
+     *                 w3 =  0FED0CBA
+     *                 w4 =  0LKJ0IHG
+     */
+   
+#if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
+    // punpcklbw  0k0h0e0b, 0j0g0d0a -> 00ed00ba
+    // punpcklwd  0l0i0f0c, ________ -> 0f__0c__
+    c64 w1 = r.lo.unpacklwd(0) | g.lo.unpacklbw(b.lo); // pix 0,1
+    // punpckhbw  0k0h0e0b, 0j0g0d0a -> 00kj00hg
+    // punpckhwd  0l0i0f0c, ________ -> 0l__0i__
+    c64 w2 = r.lo.unpackhwd(0) | g.lo.unpackhbw(b.lo); // pix 2,3
+    
+    c64 w3 = r.hi.unpacklwd(0) | g.hi.unpacklbw(b.hi); // pix 4,5
+    c64 w4 = r.hi.unpackhwd(0) | g.hi.unpackhbw(b.hi); // pix 6,7
+    #ifndef USE_MMX
+     MMX_clear();
+    #endif
+#else
+    /* With 64-bit registers, this code is greatly simpler than
+     * the emulation of unpack opcodes. However, when the
+     * unpack opcodes are available, using them is shorter.
+     * Which way is faster? FIXME: Find out
+     */
+
+    //        mask64lw:  00**00**
+    //        mask64hw:  **00**00
+    // b.lo & mask64lw:  000g000a
+    // g.lo & mask64lw:  000h000b
+    // r.lo & mask64lw:  000i000c
+    // b.lo & mask64hw:  0j000d00
+    // g.lo & mask64hw:  0k000e00
+    // r.lo & mask64hw:  0l000f00
+    
+    c64 tlo1 = ((b.lo & mask64lw)     ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16);
+    c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw)      );
+
+    c64 thi1 = ((b.hi & mask64lw)     ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16);
+    c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw)      );
+    /*
+     *                tlo1 =  0ihg0cba
+     *                tlo2 =  0lkj0fed
+     *                thi1 =  0IHG0CBA
+     *                thi2 =  0LKJ0FED
+     *            mask64ld =  0000****
+     *            mask64hd =  ****0000
+     */
+     
+    c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 0fed0000 = 0fed0cba
+    c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 00000ihg = 0lkj0ihg
+
+    c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32);
+    c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32);
+#endif
+    
+    if(rgb24)
+    {
+        /* STEP 5A: CONVERT PIXELS INTO RGB24 */
+        Convert32To24_32bytes(w1,w2,w3,w4, dest);
+    }
+    else
+    {
+        /* STEP 5B: STORE RGB32 */
+        w1.Put(dest+0);
+        w2.Put(dest+8);
+        w3.Put(dest+16);
+        w4.Put(dest+24);
+    }
+     
+    /* Reference for the unpack opcodes used in the disabled MMX branch above:
+     punpcklbw    ____ABCD, ____abcd = AaBbCcDd
+     punpcklwd    ____ABCD, ____abcd = ABabCDcd
+     punpckldq    ____ABCD, ____abcd = ABCDabcd
+     
+     punpckhbw    ABCD____, abcd____ = AaBbCcDd
+     punpckhwd    ABCD____, abcd____ = ABabCDcd
+     punpckhdq    ABCD____, abcd____ = ABCDabcd
+    */
+}
+
+/* RGB555 -> 24-bit. Output bytes per pixel come from source bits 0-4, 5-9, 10-14; swap_red_blue reverses that order. */
+void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, true> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    /* Tail (npixels % 8): Unbuild15 emits the swapped order, so exchange bytes 0 and 2 unless a swap was requested. */
+    for(unsigned a=0; a<npixels; ++a, dest += 3)
+    {
+        Unbuild15(dest, ((const unsigned short*)src)[a]);
+        if(!swap_red_blue) { const unsigned char t = dest[0]; dest[0] = dest[2]; dest[2] = t; }
+    }
+}
+
+/* RGB565 -> 24-bit. Output bytes per pixel come from source bits 0-4, 5-10, 11-15; swap_red_blue reverses that order. */
+void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    /* Tail (npixels % 8): Unbuild16 emits the swapped order, so exchange bytes 0 and 2 unless a swap was requested. */
+    for(unsigned a=0; a<npixels; ++a, dest += 3)
+    {
+        Unbuild16(dest, ((const unsigned short*)src)[a]);
+        if(!swap_red_blue) { const unsigned char t = dest[0]; dest[0] = dest[2]; dest[2] = t; }
+    }
+}
+
+/* RGB555 -> 32-bit. Bytes 0-2 of each pixel come from source bits 0-4, 5-9, 10-14 (reversed by swap_red_blue); byte 3 is 0. */
+void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    /* Tail (npixels % 8): match the loops above -- honour swap_red_blue and clear the fourth byte. */
+    for(unsigned a=0; a<npixels; ++a, dest += 4)
+    {
+        Unbuild15(dest, ((const unsigned short*)src)[a]); dest[3] = 0;
+        if(!swap_red_blue) { const unsigned char t = dest[0]; dest[0] = dest[2]; dest[2] = t; }
+    }
+}
+
+/* RGB565 -> 32-bit. Bytes 0-2 of each pixel come from source bits 0-4, 5-10, 11-15 (reversed by swap_red_blue); byte 3 is 0. */
+void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    /* Tail (npixels % 8): match the loops above -- honour swap_red_blue and clear the fourth byte. */
+    for(unsigned a=0; a<npixels; ++a, dest += 4)
+    {
+        Unbuild16(dest, ((const unsigned short*)src)[a]); dest[3] = 0;
+        if(!swap_red_blue) { const unsigned char t = dest[0]; dest[0] = dest[2]; dest[2] = t; }
+    }
+}
+
+static inline unsigned Build16(unsigned x,unsigned y, const unsigned char* rgbdata)
+{
+    const unsigned cell = (x + 4*y) % 16; /* position in the 16-entry dither table */
+    const unsigned low5  = Quantize4x4<31>(cell, rgbdata[2]);
+    const unsigned mid6  = Quantize4x4<63>(cell, rgbdata[1]);
+    return low5 | (mid6 << 5) | (unsigned(Quantize4x4<31>(cell, rgbdata[0])) << 11);
+}
+static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata)
+{
+    const unsigned cell = (x + 4*y) % 16; /* position in the 16-entry dither table */
+    const unsigned low5  = Quantize4x4<31>(cell, rgbdata[2]);
+    const unsigned mid5  = Quantize4x4<31>(cell, rgbdata[1]);
+    return low5 | (mid5 << 5) | (unsigned(Quantize4x4<31>(cell, rgbdata[0])) << 10);
+}
+
+void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    const unsigned char* rgb = (const unsigned char*) data; /* three bytes per input pixel */
+    unsigned short* out = (unsigned short*) dest;
+    unsigned col = 0, row = 0; /* pixel coordinates, passed on to Build16 for dithering */
+    for(unsigned n = 0; n < npixels; ++n, rgb += 3)
+    {
+        *out++ = Build16(col, row, rgb);
+        if(++col >= width) { col = 0; ++row; }
+    }
+}
+
+void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    const unsigned char* rgb = (const unsigned char*) data; /* three bytes per input pixel */
+    unsigned short* out = (unsigned short*) dest;
+    unsigned col = 0, row = 0; /* pixel coordinates, passed on to Build15 for dithering */
+    for(unsigned n = 0; n < npixels; ++n, rgb += 3)
+    {
+        *out++ = Build15(col, row, rgb);
+        if(++col >= width) { col = 0; ++row; }
+    }
+}
+
+#ifdef __MMX__ /* MMX helpers for the 32-bit-pixel YUV converters below */
+static inline void Convert_I420_MMX_Common
+    (c64_MMX p0_1, c64_MMX p2_3,
+     unsigned char* dest_y0,
+     unsigned char* dest_y1,
+     unsigned char* dest_u,
+     unsigned char* dest_v)
+{
+    c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
+    c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
+    c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3);
+    c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
+    /* Coefficient vectors; the fourth 16-bit lane is weighted 0. */
+    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
+    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
+    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);
+
+    c64_MMX ctotal = p0.add16(
+                     p2.add16(
+                     p1.add16(
+                     p3)));
+    /* ctotal: per-lane sum of the four pixels, used for the shared chroma sample. */
+    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
+    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
+    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
+    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
+    /* Each luma value is the sum of the two 32-bit madd results, shifted back down. */
+    c64_MMX yy;
+    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
+    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
+    
+    // Because we're writing to adjacent pixels, we optimize this by
+    // writing two 8-bit values at once in both cases.
+    *(short*)dest_y0 = yy.Extract88_from_1616lo();
+    *(short*)dest_y1 = yy.Extract88_from_1616hi();
+    /* Chroma: the extra +2 in the shift divides the four-pixel sum by 4. */
+    c64_MMX u_total32 = _mm_madd_pi16(rgb_u.value, ctotal.value);
+    c64_MMX v_total32 = _mm_madd_pi16(rgb_v.value, ctotal.value);
+    
+    *dest_u = U_ADD + ((u_total32.Extract32<0>() + u_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
+    *dest_v = V_ADD + ((v_total32.Extract32<0>() + v_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
+}
+
+static inline void Convert_YUY2_MMX_Common /* four 32-bit pixels in, eight packed luma/chroma bytes out */
+    (c64_MMX p0_1, c64_MMX p2_3,
+     unsigned char* dest_yvyu)
+{
+    c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
+    c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
+    c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3); // expand to 64-bit (4*16)
+    c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
+    /* Coefficient vectors; the fourth 16-bit lane is weighted 0. */
+    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0);
+    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);
+    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);
+
+    c64_MMX ctotal0 = p0.add16(p1); /* per-lane sum of pixels 0 and 1 */
+    c64_MMX ctotal2 = p2.add16(p3); /* per-lane sum of pixels 2 and 3 */
+  
+    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
+    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
+    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
+    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
+    /* Each luma value is the sum of the two 32-bit madd results, shifted back down. */
+    c64_MMX yy;
+    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
+
+    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value );
+    
+    c64_MMX u_total32_0 = _mm_madd_pi16(rgb_u.value, ctotal0.value);
+    c64_MMX v_total32_0 = _mm_madd_pi16(rgb_v.value, ctotal0.value);
+    c64_MMX u_total32_2 = _mm_madd_pi16(rgb_u.value, ctotal2.value);
+    c64_MMX v_total32_2 = _mm_madd_pi16(rgb_v.value, ctotal2.value);
+    
+    c64_MMX quadword = yy; // four y values: at 0, 2, 4 and 6
+    /* Chroma lanes are filled in the order v, u, v, u; the +1 in the shift halves each two-pixel sum. */
+    c64_MMX uv; uv.Init16(
+        ((v_total32_0.Extract32<0>() + v_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((u_total32_0.Extract32<0>() + u_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((v_total32_2.Extract32<0>() + v_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((u_total32_2.Extract32<0>() + u_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)) );
+    c64_MMX uv_adds; uv_adds.Init16(V_ADD, U_ADD, V_ADD, U_ADD);
+    uv = uv.add16(uv_adds);
+    
+    quadword |= uv << 8;     // two u and v values: at 1, 3, 5 and 7.
+    quadword.Put(dest_yvyu); // write four y values: at 0, 2, 4 and 6
+}
+#endif
+
+/* Converts a frame of PixStride-byte pixels to planar 4:2:0 YUV in dest: npixels luma bytes,
+ * then the plane written via vpos (npixels/4 bytes), then the plane written via upos.
+ * NOTE(review): that is V-before-U order (YV12-like) despite the I420 name -- confirm with the consumer. */
+
+template<int PixStride>
+void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    
+    unsigned pos = 0;                   /* byte offset into src */
+    unsigned ypos = 0;                  /* offset of the next luma byte in dest */
+    unsigned vpos = npixels;
+    unsigned upos = vpos + npixels / 4;
+    unsigned stride = width*PixStride;  /* bytes per source row */
+
+    /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u",
+        npixels,width,height, ypos,upos,vpos);*/
+
+    /* This function is based on code from x264 svn version 711 */
+    /* TODO: Apply MMX optimization for 24-bit pixels */
+    /* Each iteration consumes a 2x2 pixel square: four luma bytes and one byte per chroma plane. */
+    for(unsigned y=0; y<height; y += 2)
+    {
+        for(unsigned x=0; x<width; x += 2)
+        {
+        #ifdef __MMX__
+          if(PixStride == 4)
+          {
+            c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
+            c64_MMX p2_3; p2_3.Get(&src[pos+stride]); // two 32-bit pixels
+
+            pos += PixStride*2;
+            
+            Convert_I420_MMX_Common(p0_1, p2_3,
+                dest+ypos,
+                dest+ypos+width,
+                dest+upos++,
+                dest+vpos++);
+          }
+          else
+        #endif
+          {
+            int c[3], rgb[3][4]; /* c: per-component sum over the square; rgb[component][pixel] */
+            
+            /* luma */
+            for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
+            for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n + stride];
+            pos += PixStride;
+            
+            for(int n=0; n<3; ++n) c[n] += rgb[n][2] = src[pos + n];
+            for(int n=0; n<3; ++n) c[n] += rgb[n][3] = src[pos + n + stride];
+            pos += PixStride;
+
+            unsigned destpos[4] = { ypos, ypos+width, ypos+1, ypos+width+1 };
+            for(int n=0; n<4; ++n)
+            {
+                dest[destpos[n]]
+                    = Y_ADD + ((RY * rgb[0][n]
+                              + GY * rgb[1][n]
+                              + BY * rgb[2][n]
+                               ) >> RGB2YUV_SHIFT);  // y
+            }
+            /* The extra +2 in the shift divides the four-pixel sums by 4. */
+            dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) );
+            dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) ); 
+          }
+            
+            ypos += 2;
+        }
+        pos += stride; /* skip the second source row of the pair just processed */
+        ypos += width; /* likewise for the luma plane */
+    }
+    
+    /*fprintf(stderr, ",yr=%u,ur=%u,vr=%u\n",
+        ypos,upos,vpos);*/
+    
+    #ifdef __MMX__
+     MMX_clear();
+    #endif
+}
+
+template<int PixStride>
+void Convert_4byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    /* Walks npixels / width rows of PixStride-byte pixels and emits one
+     * 4-byte group per horizontally adjacent pixel pair; the scalar path
+     * stores the two luma bytes at offsets 0 and 2, V at 1 and U at 3.
+     *
+     * This function is based on code from x264 svn version 711.
+     * TODO: Apply MMX optimization for 24-bit pixels */
+    const unsigned char* in = (const unsigned char*) data;
+    unsigned char* out = dest;
+    const unsigned rows = npixels / width;
+
+    for(unsigned row = 0; row < rows; ++row)
+    {
+        for(unsigned col = 0; col < width; col += 2)
+        {
+        #ifdef __MMX__
+          if(PixStride == 4)
+          {
+            /* Four 32-bit pixels per pass, hence the extra column step
+             * here and the extra four output bytes. */
+            c64_MMX pair_a; pair_a.Get(in);               // two 32-bit pixels
+            c64_MMX pair_b; pair_b.Get(in + PixStride*2); // the next two
+            in  += PixStride*4;
+            col += 2;
+
+            Convert_YUY2_MMX_Common(pair_a, pair_b, out);
+            out += 4;
+          }
+          else
+        #endif
+          {
+            /* Scalar path: one pixel pair per pass. */
+            int first[3], second[3], both[3];
+            for(int ch = 0; ch < 3; ++ch)
+            {
+                first[ch]  = in[ch];
+                second[ch] = in[PixStride + ch];
+                both[ch]   = first[ch] + second[ch];
+            }
+            in += PixStride*2;
+
+            out[0] = Y_ADD + ((RY * first[0]
+                             + GY * first[1]
+                             + BY * first[2]) >> RGB2YUV_SHIFT);
+            out[2] = Y_ADD + ((RY * second[0]
+                             + GY * second[1]
+                             + BY * second[2]) >> RGB2YUV_SHIFT);
+
+            /* The extra shift bit halves the two-pixel channel sums. */
+            out[3] = (U_ADD + ((RU * both[0] + GU * both[1] + BU * both[2]) >> (RGB2YUV_SHIFT+1)) );
+            out[1] = (V_ADD + ((RV * both[0] + GV * both[1] + BV * both[2]) >> (RGB2YUV_SHIFT+1)) );
+          }
+            out += 4;
+        }
+    }
+
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+/*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+    __attribute__((noinline));*/
+    
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    const unsigned PixStride = 2;
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;  // byte offset into src
+    unsigned ypos = 0;  // luma write offset into dest
+    unsigned vpos = npixels;  // V samples start right after the luma bytes
+    unsigned upos = vpos + npixels / 4;  // U samples start npixels/4 bytes after V
+    unsigned stride = width*PixStride;  // bytes per source scanline
+    /* Converts 2-byte pixels into planar output: one luma byte per pixel,
+     * plus one V and one U byte per 2x2 pixel block.
+     * This function is based on code from x264 svn version 711 */
+    for(unsigned y=0; y<height; y += 2)
+    {
+        for(unsigned x=0; x<width; x += 8)
+        {
+            unsigned char Rgb2byteBuf[2][8][4];
+            
+            /* Convert 8 pixels from two scanlines (16 in total)
+             * from RGB15 / RGB16 to RGB32
+             * (Not RGB24, because RGB32 conversion is faster)
+             */
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (src+pos,        Rgb2byteBuf[0][0]);
+
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (src+pos+stride, Rgb2byteBuf[1][0]);
+
+            pos += 16;  // 8 source pixels, 2 bytes each
+            /* NOTE(review): the guard below tests _q_MMX__, not __MMX__; unless that macro is defined elsewhere, only the scalar #else branch is compiled - confirm intent. */
+            for(int x8 = 0; x8 < 8; x8 += 2)
+            {
+              #ifdef _q_MMX__
+                c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[0][x8][0]); // two 32-bit pixels (4*8)
+                c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[1][x8][0]); // two 32-bit pixels
+
+                Convert_I420_MMX_Common(p0_1, p2_3,
+                    dest+ypos,
+                    dest+ypos+width,
+                    dest+upos++,
+                    dest+vpos++);
+              #else
+                int c[3];
+                /* TODO: Some faster means than using pointers */
+                unsigned char* rgb[4] =
+                {
+                    Rgb2byteBuf[0][x8+0],
+                    Rgb2byteBuf[0][x8+1],
+                    Rgb2byteBuf[1][x8+0],
+                    Rgb2byteBuf[1][x8+1]
+                };
+                /* c[] accumulates the per-channel sum of the four pixels of this 2x2 block. */
+                for(int m=0; m<3; ++m) c[m] = 0;
+                for(int n=0; n<4; ++n)
+                    for(int m=0; m<3; ++m)
+                        c[m] += rgb[n][m];
+                /* destpos[] follows the order of rgb[]: first scanline left/right, then second scanline left/right. */
+                unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
+                for(int n=0; n<4; ++n)
+                {
+                    dest[destpos[n]]
+                        = Y_ADD + ((RY * rgb[n][0]
+                                  + GY * rgb[n][1]
+                                  + BY * rgb[n][2]
+                                   ) >> RGB2YUV_SHIFT);  // y
+                }
+                
+                /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
+                // Note: +2 is because c[] contains 4 values
+                dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2));
+                dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)); 
+              #endif
+                ypos += 2;
+            }
+        }
+        pos += stride;  // skip the second scanline; it was consumed together with the first
+        ypos += width;  // likewise skip the second luma row, already written via ypos+width
+    }
+
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    /* Converts 2-byte pixels, eight at a time, into 4-byte groups that each
+     * cover two horizontally adjacent pixels.
+     *
+     *   data    - source pixels, 2 bytes each
+     *   dest    - output buffer, filled sequentially from its start
+     *   npixels - total pixel count; npixels / width rows are processed
+     *   width   - pixels per row, consumed in steps of 8
+     *
+     * The template arguments are passed through unchanged to
+     * Convert_2byte_to_24or32Common. */
+    const unsigned char* in = (const unsigned char*) data;
+    unsigned char* out = dest;
+    const unsigned rows = npixels / width;
+
+    for(unsigned row = 0; row < rows; ++row)
+    {
+        for(unsigned col = 0; col < width; col += 8)
+        {
+            /* Expand 8 pixels of this scanline from RGB15 / RGB16 into
+             * 4-byte pixels
+             * (Not RGB24, because RGB32 conversion is faster)
+             */
+            unsigned char expanded[8][4];
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (in, expanded[0]);
+            in += 16; /* 8 source pixels, 2 bytes each */
+
+          #ifdef __MMX__
+            /* Two passes of four pixels (eight output bytes) each. */
+            for(int px = 0; px < 8; px += 4)
+            {
+                c64_MMX pair_a; pair_a.Get(&expanded[px  ][0]); // two 32-bit pixels (4*8)
+                c64_MMX pair_b; pair_b.Get(&expanded[px+2][0]); // two 32-bit pixels (4*8)
+                Convert_YUY2_MMX_Common(pair_a, pair_b, out);
+                out += 8;
+            }
+          #else
+            /* Four passes of two pixels (four output bytes) each. */
+            for(int px = 0; px < 8; px += 2)
+            {
+                const unsigned char* const first  = expanded[px];
+                const unsigned char* const second = expanded[px+1];
+
+                /* Per-channel sums of the pair, used for the shared chroma. */
+                int both[3];
+                for(int ch = 0; ch < 3; ++ch)
+                    both[ch] = first[ch] + second[ch];
+
+                out[0] = Y_ADD + ((RY * first[0]
+                                 + GY * first[1]
+                                 + BY * first[2]) >> RGB2YUV_SHIFT);
+                out[2] = Y_ADD + ((RY * second[0]
+                                 + GY * second[1]
+                                 + BY * second[2]) >> RGB2YUV_SHIFT);
+
+                // Note: +1 is because both[] contains the sum of 2 values
+                out[3] = U_ADD + ((RU * both[0] + GU * both[1] + BU * both[2]) >> (RGB2YUV_SHIFT+1));
+                out[1] = V_ADD + ((RV * both[0] + GV * both[1] + BV * both[2]) >> (RGB2YUV_SHIFT+1));
+                out += 4;
+            }
+          #endif
+        }
+    }
+
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+
+/***/
+
+void Convert_I420To24Frame(const void* data, unsigned char* dest,
+                           unsigned npixels, unsigned width, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;
+    unsigned ypos = 0;
+    unsigned vpos = npixels;
+    unsigned upos = vpos + npixels / 4;
+    /* Decodes a planar frame (npixels luma bytes, then a plane read as V, then a plane read as U, one sample per 2x2 block) into 3 bytes per pixel. */
+    /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u\n",
+        npixels,width,height, ypos,upos,vpos);*/
+    /* swap_red_blue is never consulted by this function. */
+    #ifdef __MMX__
+    c64_MMX rgb[4], yy[4];
+    static const c64_MMX vmul/*; vmul.Init16*/(VR, VG, 0, 0);  // R,G,B,0 * vmul = V
+    static const c64_MMX umul/*; umul.Init16*/(0, UG, UB, 0);  // R,G,B,0 * umul = U
+    #endif
+    /* Expected input ranges:
+     *   Y input: 16..235
+     *   U input: 16..240
+     *   V input: 16..240
+     *
+     * The row loop is deliberately NOT marked "#pragma omp parallel for":
+     * pos, ypos, upos and vpos (and, in the MMX build, rgb[] and yy[]) are
+     * declared outside the loop and carried from one iteration to the next,
+     * so running iterations concurrently would race on them. */
+    for(unsigned y=0; y<height; y += 2)
+    {
+        for(unsigned x=0; x<width; )
+        {
+        #ifdef __MMX__
+            rgb[0]=rgb[1]=rgb[2]=rgb[3]=yy[0]=yy[1]=yy[2]=yy[3]=c64_MMX(mask64hd)|mask64ld;
+            /* Somehow, this line above fixes an error
+             * where U&V seem to be off by 4 pixels.
+             * Probably a GCC bug? */
+            
+            /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
+            uint64_t tmp_u = *(uint32_t*)&src[upos];
+            uint64_t tmp_v = *(uint32_t*)&src[vpos];
+            c64_MMX uuq = c64_MMX(0)
+                     .unpacklbw(tmp_u) // 8-bit to 16-bit
+                     .sub16(Bits16const<U_ADD,U_ADD>::value)
+                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
+            c64_MMX vvq = c64_MMX(0)
+                     .unpacklbw(tmp_v)
+                     .sub16(Bits16const<V_ADD,V_ADD>::value)
+                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
+            
+            const short* uu = (const short*)&uuq;
+            const short* vv = (const short*)&vvq;
+            
+            /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
+            for(int n=0; n<4; ++n)
+            {
+                /* vv is shifted by 3 bits, vmul is shifted by 13 bits
+                 * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
+                c64_MMX v; v.Init16(vv[n]);
+                c64_MMX u; u.Init16(uu[n]);
+                rgb[n] = v.mul16hi(vmul).add16(
+                         u.mul16hi(umul)      );
+            }
+            
+            /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
+             * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
+             * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
+             * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
+             */
+            
+            unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
+            /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
+            for(int n=0; n<4; ++n)
+            {
+                c64_MMX luma; luma.Init16(
+                    src[yyoffs[0]+n*2],  /* n(0..3): x0y0,x2y0,x4y0,x6y0 */
+                    src[yyoffs[1]+n*2],  /* n(0..3): x1y0,x3y0,x5y0,x7y0 */
+                    src[yyoffs[2]+n*2],  /* n(0..3): x0y1,x2y1,x4y1,x6y1 */
+                    src[yyoffs[3]+n*2]   /* n(0..3): x1y1,x3y1,x5y1,x7y1 */
+                );
+                luma = luma.sub16(Bits16const<Y_ADD,Y_ADD>::value);
+                luma = luma.shl16(16 - YUV2RGB_SHIFT);
+                yy[n] = luma.mul16hi(Bits16const<Y_REV,Y_REV>::value);
+            }
+            const short* const yyval = (const short*) &yy[0].value;
+            /*
+                values in order:
+                   x0y0 x1y0 x0y1 x1y1
+                   x2y0 x3y0 x2y1 x3y1
+                   x4y0 x5y0 x4y1 x5y1
+                   x6y0 x7y0 x6y1 x7y1
+            */
+            int tmppos = pos;
+            for(int ny = 0; ny < 4; ny += 2)
+            {
+                /* Note: We must use 16-bit pixels here instead of 8-bit,
+                 * because the rgb+Y addition can overflow. conv_s16_u8()
+                 * does the necessary clamping, which would not be done
+                 * if the values were 8-bit.
+                 */
+                // 8 pixels for one scanline, repeated twice
+                /* Note: C++ has no named constructors, so we
+                 * use statement blocks here as substitutes.
+                 */
+                c64_MMX r0
+                    = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) ));
+                c64_MMX r1
+                    = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) ));
+                c64_MMX r2
+                    = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) ));
+                c64_MMX r3
+                    = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) ));
+
+                Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]);
+                tmppos += width*3; // next line
+            }
+            upos += 4;
+            vpos += 4;
+            ypos += 8;   // eight bytes for this line (and eight from next too)
+            pos  += 8*3; // eight triplets generated on this line
+            x    += 8;   // eight yy values used on this line
+        #else /* non-MMX */
+            int u = src[upos] - U_ADD;
+            int v = src[vpos] - V_ADD;
+
+            int rgb[3] =
+                {
+                   (VR * v         ) >> (YUV2RGB_SHIFT),
+                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
+                   (       + UB * u) >> (YUV2RGB_SHIFT)
+                };
+            
+            unsigned incr[4] = {0,1,width,width+1};
+
+            for(unsigned r=0; r<4; ++r)
+                for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r],
+                        yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
+                        n=0; n<3; ++n)
+                    dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
+
+            upos += 1;
+            vpos += 1;
+            ypos += 2; // two bytes for this line (two from next line)
+            pos  += 2*3; // two triplets generated on this line
+            x    += 2; // two yy values used on this line
+        #endif
+        }
+        ypos += width;
+        pos += 3*width;
+    }
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+void Convert_YUY2To24Frame(const void* data, unsigned char* dest,
+                           unsigned npixels, unsigned width, bool swap_red_blue)
+{
+    /* Decodes 4-byte groups (luma at bytes 0 and 2, U at byte 1, V at byte 3)
+     * into two 3-byte pixels each. Expected input: Y 16..235, U/V 16..240.
+     * swap_red_blue is not consulted. width is expected to be even.
+     * TODO: MMX optimization */
+    const unsigned char* src = (const unsigned char*) data;
+    const unsigned height = npixels / width;
+    const unsigned pairs  = (width + 1) / 2; // inner-loop passes per row
+
+    /* Each row derives its offsets from y alone, so iterations share no
+     * mutable state and may safely run in parallel. */
+  #pragma omp parallel for
+    for(unsigned y=0; y<height; ++y)
+    {
+        unsigned ypos = y * pairs * 4;   // input: four bytes per pixel pair
+        unsigned pos  = y * pairs * 2*3; // output: two triplets per pixel pair
+
+        /* x is stepped in the loop header only; stepping it in the body as
+         * well would skip every other pair and leave half the frame undone. */
+        for(unsigned x=0; x<width; x += 2)
+        {
+            int u = src[ypos+1] - U_ADD;
+            int v = src[ypos+3] - V_ADD;
+
+            int rgb[3] =
+                {
+                   (VR * v         ) >> (YUV2RGB_SHIFT),
+                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
+                   (       + UB * u) >> (YUV2RGB_SHIFT)
+                };
+
+            for(unsigned r=0; r<2; ++r)
+                for(unsigned doffs=pos + r*3, yoffs=ypos+r*2,
+                        yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
+                        n=0; n<3; ++n)
+                    dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
+
+            ypos += 4; // four bytes for this line (y,u,y,v)
+            pos  += 2*3; // two triplets generated on this line
+        }
+    }
+}
+
+/***/
+void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 24-bit source: forwards to Convert_4byte_To_I420Frame instantiated with template argument 3. */
+    Convert_4byte_To_I420Frame<3>(data,dest,npixels,width);
+}
+void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 32-bit source: forwards to Convert_4byte_To_I420Frame instantiated with template argument 4. */
+    Convert_4byte_To_I420Frame<4>(data,dest,npixels,width);
+}
+void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 15-bit source: roffs=10,rbits=5, goffs=5,gbits=5, boffs=0,bbits=5. */
+    Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
+}
+void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 16-bit source: roffs=11,rbits=5, goffs=5,gbits=6, boffs=0,bbits=5. */
+    Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
+}
+/***/
+void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 24-bit source: forwards to Convert_4byte_To_YUY2Frame with PixStride = 3. */
+    Convert_4byte_To_YUY2Frame<3>(data,dest,npixels,width);
+}
+void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 32-bit source: forwards to Convert_4byte_To_YUY2Frame with PixStride = 4. */
+    Convert_4byte_To_YUY2Frame<4>(data,dest,npixels,width);
+}
+void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 15-bit source: roffs=10,rbits=5, goffs=5,gbits=5, boffs=0,bbits=5. */
+    Convert_2byte_To_YUY2Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
+}
+void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 16-bit source: roffs=11,rbits=5, goffs=5,gbits=6, boffs=0,bbits=5. */
+    Convert_2byte_To_YUY2Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
+}
diff --git a/src/drivers/videolog/rgbtorgb.h b/src/drivers/videolog/rgbtorgb.h
new file mode 100644
index 00000000..3f779201
--- /dev/null
+++ b/src/drivers/videolog/rgbtorgb.h
@@ -0,0 +1,68 @@
+#ifdef __cplusplus
+extern "C" {
+  #define defaulttrue =true
+#else
+  #define defaulttrue
+  #define bool       int
+#endif
+/* defaulttrue: "=true" default argument in C++, empty in C (where bool is mapped to int); both macros are #undef'd at the end of this header. */
+/* RGB to RGB and RGB from/to YCbCr (YUV) conversions written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ *
+ * Concepts:
+ *   15 = RGB15 or BGR15
+ *   16 = RGB16 or BGR16
+ *   24 = RGB24 or BGR24
+ *   32 = RGB32 or BGR32
+ * I420 = YCbCr where Y is issued for each pixel,
+ *                    followed by Cr for 2x2 pixels,
+ *                    followed by Cb for 2x2 pixels
+ * YUY2 = YCbCr where for each pixel, Y is issued,
+ *                    followed by Cr for 2x1 pixels (if even pixel)
+ *                             or Cb for 2x1 pixels (if odd pixel)
+ *
+ * Note: Not all functions honor the swap_red_blue setting.
+ */
+
+void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
+    __attribute__((noinline));
+
+void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+/* Planar (I420) decoder, followed by the I420 encoders for each supported source depth. */
+void Convert_I420To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+/* Packed (YUY2) decoder, followed by the YUY2 encoders for each supported source depth. */
+void Convert_YUY2To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+/* Close the extern "C" block (C++ only) and drop the helper macros defined at the top. */
+#ifdef __cplusplus
+}
+  #undef defaulttrue
+#else
+  #undef defaulttrue
+  #undef bool
+#endif
diff --git a/src/drivers/videolog/simd.h b/src/drivers/videolog/simd.h
new file mode 100644
index 00000000..0bf3539d
--- /dev/null
+++ b/src/drivers/videolog/simd.h
@@ -0,0 +1,365 @@
+#if defined(__MMX__) && !defined(__x86_64)
+#define USE_MMX
+#endif
+#if defined(__SSE__)
+#define USE_SSE
+#endif
+
+/* SIMD interface (MMX) written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ */
+
+#ifdef __3dNOW__
+# include <mm3dnow.h> /* Note: not available on ICC */ 
+#elif defined(__MMX__)
+# include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+ #ifdef __ICC
+ typedef __m128 __v4sf;
+ #endif
+#endif
+
+struct c64_common
+{
+    /* Saturating narrowing: out-of-range inputs pin to the nearest limit. */
+    static signed char clamp_s8(int_fast64_t v)
+        { if(v < -128) return -128; if(v > 127) return 127; return v; }
+    static unsigned char clamp_u8(int_fast64_t v)
+        { if(v < 0) return 0; if(v > 255) return 255; return v; }
+    static short clamp_s16(int_fast64_t v)
+        { if(v < -32768) return -32768; if(v > 32767) return 32767; return v; }
+
+    /* Moves each byte of a into the low byte of its own 16-bit lane: 0000abcd -> 0a0b0c0d */
+    static inline uint_fast64_t expand32_8(uint_fast32_t a)
+    {
+        const uint_fast64_t wide = a;
+        return  (wide & 0x000000FFU)
+             | ((wide & 0x0000FF00U) <<  8)
+             | ((wide & 0x00FF0000U) << 16)
+             | ((wide & 0xFF000000UL) << 24);
+    }
+    /* Moves each 16-bit half of a into its own 32-bit lane: 0000abcd -> 00ab00cd */
+    static inline uint_fast64_t expand32_16(uint_fast32_t a)
+    {
+        const uint_fast64_t wide = a;
+        return (wide & 0xFFFFU) | ((wide & 0xFFFF0000UL) << 16);
+    }
+};
+
+#ifdef __MMX__
+/* 64-bit integers that use MMX / 3Dnow operations where relevant; wraps a single __m64 */
+struct c64_MMX: public c64_common
+{
+    typedef c64_MMX c64;
+    /* The wrapped 64-bit value. */
+    __m64 value;
+    /* Constructors: uninitialized; from __m64; from the bytes of a 64-bit integer; from an int (_m_from_int); from four 16-bit lanes. */
+    inline c64_MMX() { }
+    inline c64_MMX(__m64 v) : value(v) { }
+    inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }
+    inline c64_MMX(int v) : value(_m_from_int(v)) { }
+    inline c64_MMX(short a,short b,short c, short d)
+        : value(_mm_setr_pi16(a,b,c,d)) { }
+    /* operator<< / operator>> flip direction for a negative count; the <<= / >>= forms pass the count to shl64 / shr64 unchanged. */
+    inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
+    inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+    /* Single-operand packs: *this is used as both operands of the two-operand form. */
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); }
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); }
+    /* Load / store 8 bytes at p through a __m64 pointer cast. */
+    void Get(const unsigned char* p)      { value = *(const __m64*)p; }
+    void Put(      unsigned char* p)const { *(__m64*)p =  value; }
+    /* Set the four 16-bit lanes individually, or all four to the same value. */
+    void Init16(short a,short b,short c, short d)
+        { value = _mm_setr_pi16(a,b,c,d); }
+    void Init16(short a)
+        { value = _mm_set1_pi16(a); }
+    /* NOTE(review): GetD is currently identical to Get (a full 8-byte load) - confirm whether a 32-bit load was intended. */
+    void GetD(const unsigned char* p)      { value = *(const __m64*)p; }
+    /* Read 16-bit / 32-bit lane n (compile-time index) directly from the bytes of value. */
+    template<int n>
+    short Extract16() const { return ((const short*)&value)[n]; }
+    template<int n>
+    int Extract32() const { return ((const int*)&value)[n]; }
+    /* Each helper below returns byte 0 of the selected 32-bit half OR'ed with the short read at byte offset 1 of that half. */
+    short Extract88_from_1616lo() const
+    {
+        const unsigned char* data = (const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:        H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    short Extract88_from_1616hi() const
+    {
+        const unsigned char* data = 4+(const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:    H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    
+    /* Compound-assignment forms of the bitwise and 64-bit add/subtract operators. */
+    c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
+    c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
+    c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
+    c64& operator+= (const c64& b) { return *this = *this + b; }
+    c64& operator-= (const c64& b) { return *this = *this - b; }
+    /* Bitwise NOT, implemented as XOR with an all-ones pattern. */
+    c64 operator~ () const {
+        static const uint_least64_t negpat = ~(uint_least64_t)0;
+        return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
+    }
+    
+            /* psllqi: p = packed
+                       s = shift
+                       r = right, l = left
+                       l = shift in zero, a = shift in sign bit
+                       q = 64-bit, d = 32-bit, w = 16-bit
+                      [i = immed amount]
+             */
+    c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
+    c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
+    c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
+    /* 64-bit subtract / add: intrinsics when __SSE2__ is defined, plain uint64_t arithmetic on the reinterpreted values otherwise. */
+    c64 operator- (const c64& b) const
+    {
+        #ifdef __SSE2__
+        return _mm_sub_si64(value, b.value);
+        #else
+        return (const uint64_t&)value - (const uint64_t&)b.value;
+        #endif
+    }
+    c64 operator+ (const c64& b) const
+    {
+        #ifdef __SSE2__
+        return _mm_add_si64(value, b.value);
+        #else
+        return (const uint64_t&)value + (const uint64_t&)b.value;
+        #endif
+    }
+    
+    /* Thin wrappers over the shift / add / subtract / multiply intrinsics; the numeric suffix is the lane width in bits. */
+    c64 shl64(int b) const { return _mm_slli_si64(value, b); }
+    c64 shr64(int b) const { return _mm_srli_si64(value, b); }
+    c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
+    c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
+    c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
+    c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
+    c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
+    c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
+    c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
+    c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
+    c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
+    c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
+    //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
+    c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
+    c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
+    /* Interleave helpers. Note the operand order: the argument b is passed to the intrinsic first, *this second. */
+    c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
+    c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
+    c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
+    c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
+    c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
+    c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
+
+    c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
+    /* Narrowing packs of *this (first operand) and b (second operand) via the _mm_packs_* intrinsics. */
+    c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
+    c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
+    c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
+};
+#endif
+
+struct c64_nonMMX: public c64_common
+{
+    typedef c64_nonMMX c64;
+    
+    uint_least64_t value;
+    
+    inline c64_nonMMX() { }
+    inline c64_nonMMX(uint64_t v) : value(v) { }
+    inline c64_nonMMX(int v) : value(v) { }
+    inline c64_nonMMX(short a,short b,short c, short d)
+        { Init16(a,b,c,d); }
+
+    c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
+    c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); }
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); }
+
+    void Init16(short a,short b,short c, short d)
+        { uint_fast64_t aa = (unsigned short)a,
+                        bb = (unsigned short)b,
+                        cc = (unsigned short)c,
+                        dd = (unsigned short)d;
+          value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
+    void Init16(short a)
+        { Init16(a,a,a,a); }
+    void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
+               unsigned char e,unsigned char f,unsigned char g,unsigned char h)
+    {
+        value = ((uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24)))
+              | (((uint_fast64_t)e) << 32)
+              | (((uint_fast64_t)f) << 40)
+              | (((uint_fast64_t)g) << 48)
+              | (((uint_fast64_t)h) << 56);
+    }
+
+    void Get(const unsigned char* p)      { value = *(const uint_least64_t*)p; }
+    void Put(      unsigned char* p)const { *(uint_least64_t*)p =  value; }
+    
+    c64& operator&= (const c64& b) { value&=b.value; return *this; }
+    c64& operator|= (const c64& b) { value|=b.value; return *this; }
+    c64& operator^= (const c64& b) { value^=b.value; return *this; }
+    c64& operator+= (const c64& b) { value+=b.value; return *this; }
+    c64& operator-= (const c64& b) { value-=b.value; return *this; }
+    c64 operator& (const c64& b) const { return value & b.value; }
+    c64 operator| (const c64& b) const { return value | b.value; }
+    c64 operator^ (const c64& b) const { return value ^ b.value; }
+    c64 operator- (const c64& b) const { return value - b.value; }
+    c64 operator+ (const c64& b) const { return value + b.value; }
+
+    c64 operator& (uint_fast64_t b) const { return value & b; }
+
+    c64 operator~ () const { return ~value; }
+    
+    #define usimdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op b)
+
+    #define simdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        const type* o = (const type*)&b.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
+    
+    c64 shl64(int b) const { return value << b; } /* Logical shift of the whole 64-bit value. */
+    c64 shr64(int b) const { return value >> b; } /* Right shift of the whole 64-bit value. */
+    c64 shl16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, <<); return res; } /* Per-lane left shift. A 64-bit value holds FOUR 16-bit lanes (as add16/sub16/mul16 already assume); unsigned lanes avoid left-shifting negative values. */
+    c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, >>); return res; } /* Per-lane logical right shift over all four 16-bit lanes. */
+    c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; } /* Per-lane right shift on two signed 32-bit lanes (relies on int being 32 bits and >> on negative int being arithmetic). */
+    c64 sar16(int b) const { c64 res = *this; usimdsim(short, 4, >>); return res; } /* Per-lane right shift on all four signed 16-bit lanes. */
+
+    c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; } /* Lane-wise arithmetic on a copy of *this; each result is narrowed back to the lane type on assignment. */
+    c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
+    c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; } /* Two int lanes: relies on int being 32 bits. */
+    c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
+    c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; } /* Keeps the low 16 bits of each lane product. */
+    c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; } /* The ">> 16" attaches to the macro's final expression, so each lane stores (p[n]*o[n]) >> 16, the high half of the int product. */
+    c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; } /* Eight unsigned byte lanes. */
+    c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
+    
+    #undef simdsim
+    #undef usimdsim
+    
+    c64 conv_s32_s16(const c64& b) const
+    {   /* Packs the two 32-bit lanes of *this, then the two of b, into four 16-bit lanes, each passed through clamp_s16. Lanes are cast to int so negative values stay negative whatever parameter type clamp_s16 declares (assumes 32-bit int, as add32/sub32 do). */
+        c64 res; res.
+        Init16(clamp_s16((int)(value & 0xFFFFFFFFU)),
+               clamp_s16((int)(value >> 32)),
+               clamp_s16((int)(b.value & 0xFFFFFFFFU)),
+               clamp_s16((int)(b.value >> 32)));
+        return res;
+    }
+    c64 conv_s16_u8(const c64& b) const
+    {   /* Packs the four 16-bit lanes of *this, then the four of b, into eight bytes, each passed through clamp_u8. Lanes are cast to short so a negative lane reaches the clamp as a negative number instead of as 32768..65535. */
+        c64 res; res.
+        Init8(clamp_u8((short)(value & 0xFFFF)),
+              clamp_u8((short)((value >> 16) & 0xFFFF)),
+              clamp_u8((short)((value >> 32) & 0xFFFF)),
+              clamp_u8((short)((value >> 48) & 0xFFFF)),
+              clamp_u8((short)(b.value & 0xFFFF)),
+              clamp_u8((short)((b.value >> 16) & 0xFFFF)),
+              clamp_u8((short)((b.value >> 32) & 0xFFFF)),
+              clamp_u8((short)((b.value >> 48) & 0xFFFF)));
+        return res;
+    }
+    c64 conv_s16_s8(const c64& b) const
+    {   /* Packs the four 16-bit lanes of *this, then the four of b, into eight bytes, each passed through clamp_s8. Lanes are cast to short so a negative lane reaches the clamp as a negative number instead of as 32768..65535. */
+        c64 res; res.
+        Init8(clamp_s8((short)(value & 0xFFFF)),
+              clamp_s8((short)((value >> 16) & 0xFFFF)),
+              clamp_s8((short)((value >> 32) & 0xFFFF)),
+              clamp_s8((short)((value >> 48) & 0xFFFF)),
+              clamp_s8((short)(b.value & 0xFFFF)),
+              clamp_s8((short)((b.value >> 16) & 0xFFFF)),
+              clamp_s8((short)((b.value >> 32) & 0xFFFF)),
+              clamp_s8((short)((b.value >> 48) & 0xFFFF)));
+        return res;
+    }
+
+    /* TODO: Verify that these are correct (though they should never be used anyway) */
+    c64 unpacklbw(const c64& p) const
+    {   /* Combines bytes of *this and p into a new c64: via the _m_punpcklbw intrinsic when __MMX__ is defined (and not ICC), otherwise via expand32_8. */
+    #if defined(__MMX__) && !defined(__ICC)
+        /* ICC says [error: type of cast must be integral or enum]
+         * on the return value cast,
+         * so we cannot use this code on ICC. Fine for GCC. */
+        return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value; /* NOTE(review): the intrinsic path passes p first, while here this value's bytes take the unshifted positions -- the two paths look like they interleave in opposite order; confirm. */
+        return expand32_8(a) | (expand32_8(b) << 8); /* presumably expand32_8 spreads the low four bytes into every other byte position -- helper not visible here, TODO confirm */
+    #endif
+    }
+    c64 unpackhbw(const c64& p) const
+    {   /* Combines bytes from the upper halves of *this and p: via the _m_punpckhbw intrinsic when __MMX__ is defined (and not ICC), otherwise via expand32_8 on the values shifted right by 32. */
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value; /* NOTE(review): the intrinsic path passes p first, while here this value's bytes take the unshifted positions -- operand order looks reversed between the two paths; confirm. */
+        return expand32_8(a>>32) | (expand32_8(b>>32) << 8); /* presumably expand32_8 spreads four bytes into every other byte position -- TODO confirm */
+    #endif
+    }
+    c64 unpacklwd(const c64& p) const
+    {   /* Combines 16-bit words of *this and p: via the _m_punpcklwd intrinsic when __MMX__ is defined (and not ICC), otherwise via expand32_16. */
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value; /* NOTE(review): the intrinsic path passes p first, while here this value's words take the unshifted positions -- operand order looks reversed between the two paths; confirm. */
+        return expand32_16(a) | (expand32_16(b) << 16); /* presumably expand32_16 spreads the low two words into every other word position -- helper not visible here, TODO confirm */
+    #endif
+    }
+    c64 unpackhwd(const c64& p) const
+    {   /* Combines 16-bit words from the upper halves of *this and p: via the _m_punpckhwd intrinsic when __MMX__ is defined (and not ICC), otherwise via expand32_16 on the values shifted right by 32. */
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value; /* NOTE(review): the intrinsic path passes p first, while here this value's words take the unshifted positions -- operand order looks reversed between the two paths; confirm. */
+        return expand32_16(a>>32) | (expand32_16(b>>32) << 16); /* presumably expand32_16 spreads two words into every other word position -- TODO confirm */
+    #endif
+    }
+    c64 unpackldq() const { return unpackldq(*this); } /* Self-combining form: uses *this as both operands. */
+    c64 unpackldq(const c64& p) const
+    {   /* Combines the low 32-bit halves of *this and p into one 64-bit value. NOTE(review): the intrinsic path puts p's half in the low position while the fallback puts this value's half there -- confirm which order is intended. */
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        return (value & 0xFFFFFFFFU) | (p.value << 32); /* mask first: the upper half of value must not leak into the result's upper half */
+    #endif
+    }
+};
+
+#ifdef USE_MMX /* Compile-time choice of the implementation behind the c64 name. Note this tests USE_MMX, whereas the unpack fallbacks and MMX_clear test __MMX__. */
+typedef c64_MMX c64;
+#else
+typedef c64_nonMMX c64;
+#endif
+
+static inline void MMX_clear()
+{
+    /* Calls _m_femms() when __3dNOW__ is defined, else _mm_empty() when __MMX__ is defined; otherwise the body is empty. */
+    #if defined(__3dNOW__)
+    _m_femms(); /* Note: not available on ICC or Valgrind */
+    #elif defined(__MMX__)
+    _mm_empty();
+    #endif
+}
diff --git a/src/movie.cpp b/src/movie.cpp
index 5b76ad83..4e98e332 100644
--- a/src/movie.cpp
+++ b/src/movie.cpp
@@ -25,6 +25,10 @@
 #include "utils/memorystream.h"
 #include "utils/xstring.h"
 
+#ifdef CREATE_AVI
+#include "drivers/videolog/nesvideos-piece.h"
+#endif
+
 #ifdef WIN32
 #include <windows.h>
 #endif
@@ -61,7 +65,7 @@ EMOVIEMODE movieMode = MOVIEMODE_INACTIVE;
 
 //this should not be set unless we are in MOVIEMODE_RECORD!
 //FILE* fpRecordingMovie = 0;
-fstream* osRecordingMovie = 0;
+std::ostream* osRecordingMovie = 0;
 
 int currFrameCounter;
 uint32 cur_input_display = 0;
@@ -69,6 +73,7 @@ int pauseframe = -1;
 bool movie_readonly = true;
 int input_display = 0;
 int frame_display = 0;
+int last_displayed_framenumber = -1;
 
 SFORMAT FCEUMOV_STATEINFO[]={
 	{ &currFrameCounter, 4|FCEUSTATE_RLSB, "FCNT"},
@@ -305,8 +310,8 @@ MovieData::MovieData()
 	: version(MOVIE_VERSION)
 	, emuVersion(FCEU_VERSION_NUMERIC)
 	, palFlag(false)
-	, binaryFlag(false)
 	, rerecordCount(1)
+	, binaryFlag(false)
 	, greenZoneCount(0)
 {
 	memset(&romChecksum,0,sizeof(MD5DATA));
@@ -760,6 +765,14 @@ void FCEUI_LoadMovie(const char *fname, bool _read_only, bool tasedit, int _paus
 		else
 			FCEU_DispMessage("Replay started Read+Write.");
 	}
+	
+	#ifdef CREATE_AVI
+	if(LoggingEnabled)
+	{
+	    FCEU_DispMessage("Video recording enabled.\n");
+	    LoggingEnabled = 2;
+	}
+	#endif
 }
 
 static void openRecordingMovie(const char* fname)
@@ -916,8 +929,12 @@ void FCEUMOV_AddCommand(int cmd)
 
 void FCEU_DrawMovies(uint8 *XBuf)
 {
-	if(frame_display)
+	if(frame_display
+	&& movieMode != MOVIEMODE_INACTIVE
+	&& currFrameCounter != last_displayed_framenumber)
 	{
+		last_displayed_framenumber = currFrameCounter;
+		
 		char counterbuf[32] = {0};
 		if(movieMode == MOVIEMODE_PLAY)
 			sprintf(counterbuf,"%d/%d",currFrameCounter,currMovieData.records.size());
diff --git a/src/video.cpp b/src/video.cpp
index 5e7a0d1a..ca3d18ec 100644
--- a/src/video.cpp
+++ b/src/video.cpp
@@ -50,6 +50,10 @@
 #include "fceulua.h"
 #endif
 
+#ifdef CREATE_AVI
+#include "drivers/videolog/nesvideos-piece.h"
+#endif
+
 uint8 *XBuf=NULL;
 uint8 *XBackBuf=NULL;
 static uint8 *xbsave=NULL;
@@ -351,6 +355,17 @@ void FCEU_DispMessage(char *format, ...)
 
 	guiMessage.howlong = 180;
 	guiMessage.isMovieMessage = false;
+	
+	#ifdef CREATE_AVI
+	if(LoggingEnabled == 2)
+	{
+		/* While in AVI recording mode, only display bare minimum
+		 * of messages
+		 */
+		if(strcmp(guiMessage.errmsg, "Movie playback stopped.") != 0)
+			guiMessage.howlong = 0;
+	}
+	#endif
 }
 
 void FCEU_ResetMessages()