added SDL avi creation support and changed speed throttling to use it

2008-10-25 12:36:03 +00:00 · 2008-10-25 12:36:03 +00:00 · d9ea2a263a
parent 4262ce9054
commit d9ea2a263a
21 changed files with 3385 additions and 88 deletions
--- a/18
+++ b/18
@ -9,7 +9,9 @@ opts.AddOptions(
  BoolOption('LSB_FIRST', 'Least signficant byte first (non-PPC)', 1),
  BoolOption('DEBUG',     'Build with debugging symbols', 0),
  BoolOption('LUA',       'Enable Lua support', 1),
-  BoolOption('NEWPPU',    'Enable new PPU core', 0)
+  BoolOption('NEWPPU',    'Enable new PPU core', 0),
+  BoolOption('CREATE_AVI', 'Enable avi creation support (SDL only)', 0),
+  BoolOption('LOGO', 'Enable a logoscreen when creating avis (SDL only)', '1')
 )

 env = Environment(options = opts)
@ -86,7 +88,14 @@ else:
      print "*** WARNING ***"
      print "Zenity could not be found in the PATH.  File dialogs will not work without zenity installed."
      raw_input('Press any key to continue. . .')
-    
+  
+  ### Search for gd if we're not in Windows
+  if env['PLATFORM'] != 'win32' and env['PLATFORM'] != 'cygwin' and env['CREATE_AVI'] and env['LOGO']:
+    gd = conf.CheckLib('gd', autoadd=1)
+    if gd == 0:
+      env['LOGO'] = 0
+      print 'Did not find libgd, you won\'t be able to create a logo screen for your avis.'
+   
  if conf.CheckFunc('asprintf'):
    conf.env.Append(CCFLAGS = " -DHAVE_ASPRINTF")
  if env['OPENGL'] and conf.CheckLibWithHeader('GL', 'GL/gl.h', 'c++', autoadd=1):
@ -115,6 +124,11 @@ print "base CCFLAGS:",env['CCFLAGS']
 if env['DEBUG']:
  env.Append(CPPDEFINES=["_DEBUG"], CCFLAGS = ['-g'])

+if env['PLATFORM'] != 'win32' and env['PLATFORM'] != 'cygwin' and env['CREATE_AVI']:
+  env.Append(CPPDEFINES=["CREATE_AVI"])  # compile the AVI logging code in (non-Windows builds only)
+else:
+  env['CREATE_AVI']=0  # force the option off so src/SConscript does not add drivers/videolog
+
 Export('env')
 SConscript('src/SConscript')

--- a/changelog.txt
+++ b/changelog.txt
@ -1,4 +1,5 @@
 ---version 2.0.3 yet to be released---
+25-oct-2008 - shinydoofy - added support for AVI creation for SDL, see documentation/Videolog.txt for more
 19-oct-2008 - shinydoofy - toggle lag frame counter for SDL, default hotkey F8
 19-oct-2008 - shinydoofy - toggle skipping of lag frames for SDL, default hotkey F6
 19-oct-2008 - shinydoofy - [ 2179829 ] user ability to toggle "bind savestates to movie" added for SDL, default hotkey F2
--- a/documentation/Videolog.txt
+++ b/documentation/Videolog.txt
@ -0,0 +1,45 @@
+Since SVN revision 931, FCEUX features a new option to create avi files from a recorded movie and it is relatively easy to use if you know the bare basics of mencoder.
+Call "scons CREATE_AVI=1" to activate it. You will, however, most likely need mencoder to use it.
+
+You get the raw video data via stdin and the audio data from a fifo file. Let's say you want the video to be in the best quality available, no matter how long it takes or how big the avi file might get. In order to get the NES's original video resolution and a good sound quality, you might need to set some settings beforehand or just pass them along while calling mencoder.
+
+
+Here's an example:
+./fceux \
+  --xscale 1 --yscale 1 --special 0 \
+  --pal 0 \
+  --sound 1 --soundq 1 --soundrate 48000 \
+  --nospritelim 1 \
+  --videolog "mencoder - -o myfirstencodedrun.avi \
+    -ovc x264 -x264encopts qp=0 \
+    -oac pcm \
+    -noskip -nocache -mc 0 -aspect 4/3 \
+    NESVSETTINGS" \
+  --playmov mymovie.fm2 myROM.nes
+
+Now let's see what is done and why we did it:
+First of all, we started fceux with "./fceux" and gave it some options:
+ --xscale and --yscale determine how much bigger the video is in comparison to its regular size. There is no point in using anything other than 1 here because you can always watch your video in fullscreen or at least scale it, can't you? As a nice bonus, it takes less time to create the avi file and also saves valuable space on your hard disk.
+ --special would usually do something fancy to your picture when you're playing a ROM, but again, it's mostly pointless to use for an avi.
+ --pal 0 lets the game run at ~60Hz. Set this to 1 if you are using a PAL region ROM.
+ --sound 1 activates sound.
+ --soundq 1 activates high quality sound.
+ --soundrate 48000 sets the sound at 48kHz.
+ --nospritelim deactivates the NES's 8 sprites per scanline limit.
+ --videolog calls mencoder:
+  - states we're getting the video stream from stdin.
+  -o determines the name of the produced avi file.
+  -ovc x264 sets the video codec to x264, which is highly recommended for quality reasons.
+  -x264encopts qp=0 tells the x264 codec to use a quantizer of 0, which results in lossless video quality.
+  -oac pcm saves the audio data uncompressed (watch out, this might turn out really big).
+  -noskip makes sure that no frame is dropped.
+  -nocache is responsible for immediate encoding and not using any cache.
+  -mc 0 makes sure that the sound does not go out of sync.
+  -aspect 4/3 sets the avi's aspect ratio so you can see it in fullscreen and have no borders to the left and right.
+  NESVSETTINGS takes care of proper recognition of the audio and video data from FCEUX.
+  &> mencoder.log lets mencoder's output log into a file called mencoder.log in your current working directory.
+ --playmov reads which movie file we want to load (here it's mymovie.fm2) and which ROM to use for it (myROM.nes).
+
+To go for faster encoding and thus less quality, change "-ovc x264 -x264encopts qp=0" to "-ovc xvid -xvidencopts bitrate=200" and "-oac pcm" to "-oac mp3lame -lameopts mode=3:preset=60" to create a 200 kbps xvid video with 60 kbps of mono mp3 audio.
+
+One last reminder: setting all these options for FCEUX of course changes the settings you've set before (like sound quality or whether or not to scale the video image). So be sure to back up your config file first (in ~/.fceux/) if you don't want to set it all up again after encoding.
--- a/src/SConscript
+++ b/src/SConscript
@ -32,7 +32,8 @@ drivers/common
 fir
 input
 utils
-mappers""")
+mappers
+""")
 #palettes

 Import('env')
@ -41,6 +42,11 @@ Export('env')
 if env['LUA']:
  file_list.append('lua-engine.cpp')

+if env['CREATE_AVI']:
+  subdirs.append('drivers/videolog')
+  
+
+
 for dir in subdirs:
  subdir_files = SConscript('%s/SConscript' % dir)
  file_list.append(subdir_files)
--- a/src/drivers/common/vidblit.cpp
+++ b/src/drivers/common/vidblit.cpp
@ -284,7 +284,7 @@ void SetPaletteBlitToHigh(uint8 *src)
 }
 }

-static void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
+void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
 {
 int x,y;

@ -306,7 +306,7 @@ static void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch)
 }


-static void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
+void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
        int shiftr[3], int shiftl[3])
 {
 int x,y;
--- a/src/drivers/common/vidblit.h
+++ b/src/drivers/common/vidblit.h
@ -23,3 +23,7 @@ void SetPaletteBlitToHigh(uint8 *src);
 void KillBlitToHigh(void);
 void Blit8ToHigh(uint8 *src, uint8 *dest, int xr, int yr, int pitch, int xscale, int yscale);
 void Blit8To8(uint8 *src, uint8 *dest, int xr, int yr, int pitch, int xscale, int yscale, int efx, int special);
+
+void Blit32to24(uint32 *src, uint8 *dest, int xr, int yr, int dpitch);
+void Blit32to16(uint32 *src, uint16 *dest, int xr, int yr, int dpitch,
+        int shiftr[3], int shiftl[3]);
--- a/src/drivers/sdl/config.cpp
+++ b/src/drivers/sdl/config.cpp
@ -185,6 +185,10 @@ InitConfig()
    // load lua script
    config->addOption("loadlua", "SDL.LuaScript", "");
    #endif
+    
+    #ifdef CREATE_AVI
+    config->addOption("videolog",  "SDL.VideoLog",  "");
+    #endif    
 	
 	// enable new PPU core
 	config->addOption("newppu", "SDL.NewPPU", "0");
--- a/src/drivers/sdl/input.cpp
+++ b/src/drivers/sdl/input.cpp
@ -253,7 +253,7 @@ KeyboardCommands()

    // Toggle throttling
    NoWaiting &= ~1;
-    if(KEY(GRAVE)) {
+    if(KEY(TAB)) {
        NoWaiting |= 1;
    }

--- a/src/drivers/sdl/sdl-throttle.cpp
+++ b/src/drivers/sdl/sdl-throttle.cpp
@ -4,14 +4,25 @@
 #include "sdl.h"
 #include "throttle.h"

-static uint64 s_tfreq;
-static uint64 s_desiredfps;
+static const double Slowest = 0.015625; // 1/64x speed (around 1 fps on NTSC)
+static const double Fastest = 32;       // 32x speed   (around 1920 fps on NTSC)
+static const double Normal  = 1.0;      // 1x speed    (around 60 fps on NTSC)

-static int32 s_fpsScaleTable[]=
-{ 3, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048 };
-int32 g_fpsScale = 256;
+static uint64 Lasttime, Nexttime;
+static long double desired_frametime;
+static int InFrame;
+double g_fpsScale = Normal; // used by sdl.cpp
+bool MaxSpeed = false;

-#define FPS_TABLE_SIZE (sizeof(s_fpsScaleTable) / sizeof(s_fpsScaleTable[0]))
+/* LOGMUL = exp(log(2) / 3)
+ *
+ * This gives us a value such that if we do x*=LOGMUL three times,
+ * then after that, x is twice the value it was before.
+ *
+ * This gives us three speed steps per doubling of the emulation speed.
+ *
+ */
+#define LOGMUL 1.259921049894873

 /**
 * Refreshes the FPS throttling variables.
@ -19,87 +30,89 @@ int32 g_fpsScale = 256;
 void
 RefreshThrottleFPS()
 {
-    s_desiredfps = FCEUI_GetDesiredFPS() >> 8;
-    s_desiredfps = (s_desiredfps * g_fpsScale) >> 8;
-    s_tfreq = 10000000;
-    s_tfreq <<= 16; /* Adjust for fps returned from FCEUI_GetDesiredFPS(). */
+    uint64 fps = FCEUI_GetDesiredFPS(); // Do >> 24 to get in Hz
+    desired_frametime = 16777216.0l / (fps * g_fpsScale);
+
+    Lasttime=0;   
+    Nexttime=0;
+    InFrame=0;
 }

 /**
 * Perform FPS speed throttling by delaying until the next time slot.
 */
-void
+int
 SpeedThrottle()
 {
-    bool doDelay;
-
-    // XXX soules - go back through and get rid of static function variables
-    static uint64 ttime,ltime=0;
-  
-    // loop until we've delayed enough
-    do {
-        doDelay = false;
-
-        // check the current time
-        ttime = SDL_GetTicks();
-        ttime *= 10000;
-
-        if((ttime - ltime) < (s_tfreq / s_desiredfps)) {
-            int64 delay = (s_tfreq / s_desiredfps) - (ttime - ltime);
-            if(delay > 0) {
-                SDL_Delay(delay / 10000);
-            }
-
-            doDelay = true;
-        }
-    } while(doDelay);
-
-    // update the "last time" to match when we want the next tick
-    if((ttime - ltime) >= ((s_tfreq * 4) / s_desiredfps)) {
-        ltime = ttime;
-    } else {
-        ltime += s_tfreq / s_desiredfps;
+    if(g_fpsScale >= 32)
+    {
+        return 0; /* Done waiting */
    }
+    uint64 time_left;
+    uint64 cur_time;
+    
+    if(!Lasttime)
+        Lasttime = SDL_GetTicks();
+    
+    if(!InFrame)
+    {
+        InFrame = 1;
+        Nexttime = Lasttime + desired_frametime * 1000;
+    }
+    
+    cur_time  = SDL_GetTicks();
+    if(cur_time >= Nexttime)
+        time_left = 0;
+    else
+        time_left = Nexttime - cur_time;
+    
+    if(time_left > 50)
+    {
+        time_left = 50;
+        /* In order to keep input responsive, don't wait too long at once */
+        /* 50 ms wait gives us a 20 Hz responsetime which is nice. */
+    }
+    else
+        InFrame = 0;
+    
+    /*fprintf(stderr, "attempting to sleep %Ld ms, frame complete=%s\n",
+        time_left, InFrame?"no":"yes");*/
+    SDL_Delay(time_left);
+    
+    if(!InFrame)
+    {
+        Lasttime = SDL_GetTicks();
+        return 0; /* Done waiting */
+    }
+    return 1; /* Must still wait some more */
 }

 /**
 * Set the emulation speed throttling to the next entry in the speed table.
 */
-void
-IncreaseEmulationSpeed()
+void IncreaseEmulationSpeed(void)
 {
-    int i = 0;
+    g_fpsScale *= LOGMUL;
+    
+    if(g_fpsScale > Fastest) g_fpsScale = Fastest;

-    // find the next entry in the FPS rate table
-    while(i < (FPS_TABLE_SIZE - 2) && s_fpsScaleTable[i] < g_fpsScale) {
-        i++;
-    }
-    g_fpsScale = s_fpsScaleTable[i+1];
-
-    // refresh the FPS throttling variables
    RefreshThrottleFPS();
-
-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+     
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }

 /**
 * Set the emulation speed throttling to the previous entry in the speed table.
 */
-void
-DecreaseEmulationSpeed()
+void DecreaseEmulationSpeed(void)
 {
-    int i = 1;
+    g_fpsScale /= LOGMUL;
+    if(g_fpsScale < Slowest)
+        g_fpsScale = Slowest;

-    // find the previous entry in the FPS rate table
-    while(i < FPS_TABLE_SIZE && s_fpsScaleTable[i] < g_fpsScale) {
-        i++;
-    } 
-    g_fpsScale = s_fpsScaleTable[i - 1];
-
-    // refresh the FPS throttling variables
    RefreshThrottleFPS();

-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }

 /**
@ -108,21 +121,24 @@ DecreaseEmulationSpeed()
 void
 FCEUD_SetEmulationSpeed(int cmd)
 {
+    MaxSpeed = false;
+    
    switch(cmd) {
    case EMUSPEED_SLOWEST:
-        g_fpsScale = s_fpsScaleTable[0];
+        g_fpsScale = Slowest;
        break;
    case EMUSPEED_SLOWER:
        DecreaseEmulationSpeed();
        break;
    case EMUSPEED_NORMAL:
-        g_fpsScale = 256;
+        g_fpsScale = Normal;
        break;
    case EMUSPEED_FASTER:
        IncreaseEmulationSpeed();
        break;
    case EMUSPEED_FASTEST:
-        g_fpsScale = s_fpsScaleTable[FPS_TABLE_SIZE - 1];
+        g_fpsScale = Fastest;
+        MaxSpeed = true;
        break;
    default:
        return;
@ -130,5 +146,5 @@ FCEUD_SetEmulationSpeed(int cmd)

    RefreshThrottleFPS();

-    FCEU_DispMessage("emulation speed %d%%",(g_fpsScale*100)>>8);
+    FCEU_DispMessage("emulation speed %.1f%%", g_fpsScale*100.0);
 }
--- a/src/drivers/sdl/sdl-video.cpp
+++ b/src/drivers/sdl/sdl-video.cpp
@ -33,9 +33,13 @@
 #include "sdl-icon.h"
 #include "dface.h"

-#include "../common/configSys.h"
+#include "../common/configSys.h"
 #include "sdl-video.h"

+#ifdef CREATE_AVI
+#include "../videolog/nesvideos-piece.h"
+#endif
+
 // GLOBALS
 extern Config *g_config;

@ -64,16 +68,18 @@ static int noframe;

 static int s_paletterefresh;

+extern bool MaxSpeed;
+
 /**
 * Attempts to destroy the graphical video display.  Returns 0 on
 * success, -1 on failure.
- */
-
+ */
+
 //draw input aids if we are fullscreen
 bool FCEUD_ShouldDrawInputAids()
 {
 	return s_fullscreen!=0;
-}
+}
 
 int
 KillVideo()
@ -596,6 +602,80 @@ BlitScreen(uint8 *XBuf)
    SDL_UpdateRect(s_screen, xo, yo,
                   (Uint32)(NWIDTH * s_exs), (Uint32)(s_tlines * s_eys));

+#ifdef CREATE_AVI
+#if 0 /* PAL INTO NTSC HACK */
+ { int fps = FCEUI_GetDesiredFPS();
+ if(FCEUI_GetDesiredFPS() == 838977920) fps = 1008307711;
+ NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp);
+ if(FCEUI_GetDesiredFPS() == 838977920)
+ {
+   static unsigned dup=0;
+   if(++dup==5) { dup=0;
+   NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp); }
+ } }
+#else
+ { int fps = FCEUI_GetDesiredFPS();
+   static unsigned char* result = NULL;
+   static unsigned resultsize = 0;
+   int width = NWIDTH, height = s_tlines;
+   if(!result || resultsize != width*height*3*2)
+   {
+       if(result) free(result);
+       result = (unsigned char*) malloc(resultsize = width*height*3*2);
+   }
+   switch(s_curbpp)
+   {
+   #if 0
+     case 24: case 32: case 15: case 16:
+       /* Convert to I420 if possible, because our I420 conversion is optimized
+        * and it'll produce less network traffic, hence faster throughput than
+        * anything else. And H.264 eats only I420, so it'd be converted sooner
+        * or later anyway if we didn't do it. Win-win situation.
+        */
+       switch(s_curbpp)
+       {
+         case 32: Convert32To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 24: Convert24To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 15: Convert15To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+         case 16: Convert16To_I420Frame(s_screen->pixels, &result[0], width*height, width); break;
+       }
+       NESVideoLoggingVideo(&result[0], width,height, fps, 12);
+       break;
+   #endif
+     default:
+       NESVideoLoggingVideo(s_screen->pixels, width,height, fps, s_curbpp);
+   }
+ }
+#endif
+
+#if REALTIME_LOGGING
+ {
+   static struct timeval last_time;
+   static int first_time=1;
+   extern long soundrate;
+   
+   struct timeval cur_time;
+   gettimeofday(&cur_time, NULL);
+   
+   double timediff =
+       (cur_time.tv_sec *1e6 + cur_time.tv_usec
+     - (last_time.tv_sec *1e6 + last_time.tv_usec)) / 1e6;
+   
+   int nframes = timediff * 60 - 1;
+   if(first_time)
+     first_time = 0;
+   else while(nframes > 0)
+   {
+     static const unsigned char Buf[800*4] = {0};
+     NESVideoLoggingVideo(screen->pixels, 256,tlines, FCEUI_GetDesiredFPS(), s_curbpp);
+     NESVideoLoggingAudio(Buf, soundrate,16,1, soundrate/60.0);
+     --nframes;
+   }
+   memcpy(&last_time, &cur_time, sizeof(last_time));
+ }
+#endif
+#endif
+
    // have to flip the displayed buffer in the case of double buffering
    if(s_screen->flags & SDL_DOUBLEBUF) {
        SDL_Flip(s_screen);
--- a/src/drivers/sdl/sdl.cpp
+++ b/src/drivers/sdl/sdl.cpp
@ -32,12 +32,18 @@

 #include "../common/configSys.h"

+#ifdef CREATE_AVI
+#include "../videolog/nesvideos-piece.h"
+#endif
+

 #ifdef WIN32
 #include <windows.h>
 #endif

-extern int32 g_fpsScale;
+extern double g_fpsScale;
+
+extern bool MaxSpeed;

 int CloseGame(void);

@ -111,10 +117,12 @@ static void ShowUsage(char *prog)
 	puts("Options:");
 	puts(DriverUsage);
 	#ifdef _S9XLUA_H
-	puts ("--loadlua       f      Loads lua script from filename f.\n");
-	#else
-	puts("");
+	puts ("--loadlua       f      Loads lua script from filename f.");
 	#endif
+	#ifdef CREATE_AVI
+	puts ("--videolog      c      Call mencoder to grab the video and audio streams to\n                       encode them. Check the documentation for more on this.");
+	#endif
+	puts("");
 }

 /**
@ -280,16 +288,48 @@ FCEUD_Update(uint8 *XBuf,
 {
    extern int FCEUDnetplay;

+    #ifdef CREATE_AVI
+    if(LoggingEnabled == 2 || (eoptions&EO_NOTHROTTLE))
+    {
+      if(LoggingEnabled == 2)
+      {
+        int16* MonoBuf = (int16*)malloc(sizeof(*MonoBuf) * Count);
+        int n;
+        for(n=0; n<Count; ++n)
+            MonoBuf[n] = Buffer[n] & 0xFFFF;
+        NESVideoLoggingAudio
+         (
+          MonoBuf, 
+          FSettings.SndRate, 16, 1,
+          Count
+         );
+        free(MonoBuf);
+      }
+      Count /= 2;
+      if(inited & 1)
+      {
+        if(Count > GetWriteSound()) Count = GetWriteSound();
+        if(Count > 0 && Buffer) WriteSound(Buffer,Count);   
+      }
+      if(inited & 2)
+        FCEUD_UpdateInput();
+      if(XBuf && (inited & 4)) BlitScreen(XBuf);
+      
+      //SpeedThrottle();
+        return;
+     }
+    #endif
+    
    int ocount = Count;
    // apply frame scaling to Count
-    Count = (Count<<8) / g_fpsScale;
+    Count = (int)(Count / g_fpsScale);
    if(Count) {
        int32 can=GetWriteSound();
        static int uflow=0;
        int32 tmpcan;

        // don't underflow when scaling fps
-        if(can >= GetMaxSound() && g_fpsScale<=256) uflow=1;	/* Go into massive underflow mode. */
+        if(can >= GetMaxSound() && g_fpsScale==1.0) uflow=1;	/* Go into massive underflow mode. */

        if(can > Count) can=Count;
        else uflow=0;
@ -299,7 +339,7 @@ FCEUD_Update(uint8 *XBuf,
        //if(uflow) puts("Underflow");
        tmpcan = GetWriteSound();
        // don't underflow when scaling fps
-        if(g_fpsScale>256 || ((tmpcan < Count*0.90) && !uflow)) {
+        if(g_fpsScale>1.0 || ((tmpcan < Count*0.90) && !uflow)) {
            if(XBuf && (inited&4) && !(NoWaiting & 2))
                BlitScreen(XBuf);
            Buffer+=can;
@ -328,7 +368,10 @@ FCEUD_Update(uint8 *XBuf,

    } else {
        if(!NoWaiting && (!(eoptions&EO_NOTHROTTLE) || FCEUI_EmulationPaused()))
-            SpeedThrottle();
+        while (SpeedThrottle())
+        {
+            FCEUD_UpdateInput();
+        }
        if(XBuf && (inited&4)) {
            BlitScreen(XBuf);
        }
@ -468,6 +511,17 @@ SDL_GL_LoadLibrary(0);
    // update the emu core
    UpdateEMUCore(g_config);
    g_config->getOption("SDL.Frameskip", &frameskip);
+    
+    #ifdef CREATE_AVI
+    {std::string tmp;
+    g_config->getOption("SDL.VideoLog", &tmp);
+    g_config->setOption("SDL.VideoLog", "");
+    if(!tmp.empty())
+    {
+        NESVideoSetVideoCmd(tmp.c_str());
+        LoggingEnabled = 1;
+    }}
+    #endif

    // load the specified game
    error = LoadGame(argv[romIndex]);
--- a/src/drivers/sdl/throttle.h
+++ b/src/drivers/sdl/throttle.h
@ -1,2 +1,2 @@
 void RefreshThrottleFPS(void);
-void SpeedThrottle(void);
+int SpeedThrottle(void);
--- a/src/drivers/videolog/SConscript
+++ b/src/drivers/videolog/SConscript
@ -0,0 +1,15 @@
+# Sources of the optional video-logging driver, named relative to this directory.
+my_list = Split("""
+nesvideos-piece.cpp
+rgbtorgb.cpp
+""")
+
+Import('env')
+
+if env['LOGO']:
+  # The logo screen requires libgd; advertise it to the sources via HAVE_GD.
+  env.Append(LIBS = ["gd"], CPPDEFINES = ["HAVE_GD"])
+
+my_list = ['drivers/videolog/' + x for x in my_list]  # paths relative to src/
+Return('my_list')
+
--- a/src/drivers/videolog/nesvideos-piece.cpp
+++ b/src/drivers/videolog/nesvideos-piece.cpp
--- a/src/drivers/videolog/nesvideos-piece.h
+++ b/src/drivers/videolog/nesvideos-piece.h
@ -0,0 +1,50 @@
+#ifndef NESVPIECEhh
+#define NESVPIECEhh
+
+#define NESVIDEOS_LOGGING 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Is video logging enabled? 0=no, 1=yes, 2=active. Default value: 0 */ 
+extern int LoggingEnabled; 
+
+/* Get and set the video recording command (shell command) */ 
+extern const char* NESVideoGetVideoCmd(void); 
+extern void NESVideoSetVideoCmd(const char *cmd);
+
+/* Save 1 frame of video. The pixel depth of data is given by bpp (bits per pixel). */
+/* FPS is scaled by 24 bits (*0x1000000) */
+/* Does not do anything if LoggingEnabled<2. */ 
+extern void NESVideoLoggingVideo
+    (const void*data, unsigned width, unsigned height,
+     unsigned fps_scaled,
+     unsigned bpp); 
+
+/* Save nsamples samples of audio. rate, bits and chans describe the format (presumably required on the first call -- TODO confirm). */
+/* Does not do anything if LoggingEnabled<2. */ 
+/* The interval of calling this function is not important, as long as all the audio
+ * data is eventually written without too big delay (5 seconds is too big)
+ * This function may be called multiple times per video frame, or once per a few video
+ * frames, or anything in between. Just that all audio data must be written exactly once,
+ * and in order. */ 
+extern void NESVideoLoggingAudio
+    (const void*data,
+     unsigned rate, unsigned bits, unsigned chans,
+     unsigned nsamples);
+/* nsamples*chans*(bits/8) = bytes in *data. */
+
+/* Requests that the current AVI be closed and a new one be started. */
+/* Use when encoding parameters have changed. */
+extern void NESVideoNextAVI(void);
+
+extern void NESVideoSetRerecordingMode(long FrameNumber);
+extern void NESVideoRerecordingSave(const char* slot);
+extern void NESVideoRerecordingLoad(const char* slot);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/drivers/videolog/quantize.h
+++ b/src/drivers/videolog/quantize.h
@ -0,0 +1,185 @@
+/*
+ Ordered dithering methods provided for:
+   8x8 (Quantize8x8)
+   4x4 (Quantize4x4)
+   3x3 (Quantize3x3)
+   4x2 (Quantize4x2)
+   3x2 (Quantize3x2)
+   2x2 (Quantize2x2)
+ The functions are:
+ 
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, double value)
+   
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, unsigned char value)
+
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+ Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+*/
+
+#define OrderedDitherDecl(n) \
+    static const double flts[n]; \
+    static const int ints[n]; \
+    enum { mul = n+1, \
+           maxin = in_max, \
+           even = !(maxin % mul), \
+           intmul = even ? 1 : mul };
+
+/* macros for initializing dither tables (fully parenthesized so they expand safely in any expression) */
+#define d(n) ((n)/double(mul) - 0.5)
+#define i(n) (even ? ((n)*in_max/mul - (int)in_max/2) \
+                   : ((n)*in_max - (int)mul*in_max/2))
+
+template<int m, int in_max = 255>
+struct QuantizeNoDither
+{
+    int res;
+    template<typename IntType>
+    QuantizeNoDither(IntType v) : res(v * m / in_max) { }
+    operator int() const { return res; }
+};
+
+template<int m, typename Base>
+struct QuantizeFuncBase: private Base
+{
+    int res;
+    
+    QuantizeFuncBase(size_t quant_pos, double v) : res(0)
+    {
+        if(v > 0.0)
+        {
+            const double dither_threshold = Base::flts[quant_pos];
+            res = (int)(v * (m / double(Base::maxin)) + dither_threshold);
+            if(res > m) res = m;
+        }
+    }
+    
+    QuantizeFuncBase(size_t quant_pos, unsigned char v) : res(v)
+    {
+        if(m == Base::maxin) return;
+        if(m < Base::maxin)
+        {
+            // With dithering
+            const int dither_threshold = Base::ints[quant_pos];
+            const int intmul = Base::intmul;
+            res = (res * (m * intmul) + dither_threshold) / (Base::maxin * intmul);
+        }
+        else
+        {
+            // Without dithering
+            res = QuantizeNoDither<m, Base::maxin> (res);
+        }
+    }
+};
+
+#define QuantizeFuncDecl(name, base) \
+  template<int m, int in_max=255> \
+  struct name: private QuantizeFuncBase<m, base<in_max> > \
+  { \
+      typedef QuantizeFuncBase<m, base<in_max> > Base; \
+      template<typename A, typename B> name(A a, B b) : Base(a, b) { } \
+      operator int() const { return Base::res; } \
+  }
+
+/******* Quantizing with 8x8 ordered dithering ********/
+template<int in_max> struct OrderedDither_8x8 { OrderedDitherDecl(8*8) };
+    template<int in_max>
+    const double OrderedDither_8x8<in_max>::flts[] /* A table for 8x8 ordered dithering */
+    = { d(1 ), d(49), d(13), d(61), d( 4), d(52), d(16), d(64),
+        d(33), d(17), d(45), d(29), d(36), d(20), d(48), d(32),
+        d(9 ), d(57), d( 5), d(53), d(12), d(60), d( 8), d(56),
+        d(41), d(25), d(37), d(21), d(44), d(28), d(40), d(24),
+        d(3 ), d(51), d(15), d(63), d( 2), d(50), d(14), d(62),
+        d(35), d(19), d(47), d(31), d(34), d(18), d(46), d(30),
+        d(11), d(59), d( 7), d(55), d(10), d(58), d( 6), d(54),
+        d(43), d(27), d(39), d(23), d(42), d(26), d(38), d(22) };
+    template<int in_max>
+    const int OrderedDither_8x8<in_max>::ints[]
+    = { i(1 ), i(49), i(13), i(61), i( 4), i(52), i(16), i(64),
+        i(33), i(17), i(45), i(29), i(36), i(20), i(48), i(32),
+        i(9 ), i(57), i( 5), i(53), i(12), i(60), i( 8), i(56),
+        i(41), i(25), i(37), i(21), i(44), i(28), i(40), i(24),
+        i(3 ), i(51), i(15), i(63), i( 2), i(50), i(14), i(62),
+        i(35), i(19), i(47), i(31), i(34), i(18), i(46), i(30),
+        i(11), i(59), i( 7), i(55), i(10), i(58), i( 6), i(54),
+        i(43), i(27), i(39), i(23), i(42), i(26), i(38), i(22) };
+QuantizeFuncDecl(Quantize8x8, OrderedDither_8x8);
+
+
+/******* Quantizing with 4x4 ordered dithering ********/
+template<int in_max> struct OrderedDither_4x4 { OrderedDitherDecl(4*4) };
+    template<int in_max>
+    const double OrderedDither_4x4<in_max>::flts[] /* A table for 4x4 ordered dithering */
+    = { d( 1), d( 9), d( 3), d(11),
+        d(13), d( 5), d(15), d( 7),
+        d( 4), d(12), d( 2), d(10),  
+        d(16), d( 8), d(14), d( 6) };
+    template<int in_max>
+    const int OrderedDither_4x4<in_max>::ints[]
+    = { i( 1), i( 9), i( 3), i(11),
+        i(13), i( 5), i(15), i( 7),
+        i( 4), i(12), i( 2), i(10),
+        i(16), i( 8), i(14), i( 6) };
+QuantizeFuncDecl(Quantize4x4, OrderedDither_4x4);
+
+/******* Quantizing with 3x3 ordered dithering ********/
+template<int in_max> struct OrderedDither_3x3 { OrderedDitherDecl(3*3) };
+    template<int in_max>
+    const double OrderedDither_3x3<in_max>::flts[] /* A table for 3x3 ordered dithering */
+    = { d(1), d(7), d(3),
+        d(6), d(4), d(9),
+        d(8), d(2), d(5) };
+    template<int in_max>
+    const int OrderedDither_3x3<in_max>::ints[]
+    = { i(1), i(7), i(3),
+        i(6), i(4), i(9),  
+        i(8), i(2), i(5) };
+QuantizeFuncDecl(Quantize3x3, OrderedDither_3x3);
+
+/******* Quantizing with 4x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_4x2 { OrderedDitherDecl(4*2) };
+    template<int in_max>
+    const double OrderedDither_4x2<in_max>::flts[] /* A table for 4x2 ordered dithering */
+    = { d(1), d(5), d(2), d(6),
+        d(7), d(3), d(8), d(4) };
+    template<int in_max>
+    const int OrderedDither_4x2<in_max>::ints[]
+    = { i(1), i(5), i(2), i(6),
+        i(7), i(3), i(8), i(4) };
+QuantizeFuncDecl(Quantize4x2, OrderedDither_4x2);
+
+/******* Quantizing with 3x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_3x2 { OrderedDitherDecl(3*2) };
+    template<int in_max>
+    const double OrderedDither_3x2<in_max>::flts[] /* A table for 3x2 ordered dithering */
+    = { d(1), d(5), d(3),
+        d(4), d(2), d(6) };
+    template<int in_max>
+    const int OrderedDither_3x2<in_max>::ints[]
+    = { i(1), i(5), i(3),
+        i(4), i(2), i(6) };
+QuantizeFuncDecl(Quantize3x2, OrderedDither_3x2);
+
+/******* Quantizing with 2x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_2x2 { OrderedDitherDecl(2*2) };
+    template<int in_max>
+    const double OrderedDither_2x2<in_max>::flts[] /* A table for 2x2 ordered dithering */
+    = { d(1), d(4),
+        d(3), d(2) };
+    template<int in_max>
+    const int OrderedDither_2x2<in_max>::ints[]
+    = { i(1), i(4),
+        i(3), i(2) };
+QuantizeFuncDecl(Quantize2x2, OrderedDither_2x2);
+
+
+#undef OrderedDitherDecl
+#undef QuantizeFuncDecl
+#undef i
+#undef d
--- a/src/drivers/videolog/rgbtorgb.cpp
+++ b/src/drivers/videolog/rgbtorgb.cpp
--- a/src/drivers/videolog/rgbtorgb.h
+++ b/src/drivers/videolog/rgbtorgb.h
@ -0,0 +1,73 @@
+#ifndef RGBTORGBhh
+#define RGBTORGBhh
+
+#ifdef __cplusplus
+extern "C" {
+  #define defaulttrue =true
+#else
+  #define defaulttrue
+  #define bool       int
+#endif
+
+/* RGB to RGB and RGB from/to YCbCr (YUV) conversions written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ *
+ * Concepts:
+ *   15 = RGB15 or BGR15
+ *   16 = RGB16 or BGR16
+ *   24 = RGB24 or BGR24
+ *   32 = RGB32 or BGR32
+ * I420 = YCbCr where Y is issued for each pixel,
+ *                    followed by Cr for 2x2 pixels,
+ *                    followed by Cb for 2x2 pixels
+ * YUY2 = YCbCr where for each pixel, Y is issued,
+ *                    followed by Cr for 2x1 pixels (if even pixel)
+ *                             or Cb for 2x1 pixels (if odd pixel)
+ *
+ * Note: Not all functions honor the swap_red_blue setting.
+ *
+ * The include guard matters in C++: the prototypes below carry default
+ * arguments, which may not be repeated by a second declaration.
+ */
+
+void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
+    __attribute__((noinline));
+
+void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert_I420To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert_YUY2To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+#ifdef __cplusplus
+}
+  #undef defaulttrue
+#else
+  #undef defaulttrue
+  #undef bool
+#endif
+
+#endif /* RGBTORGBhh */
--- a/src/drivers/videolog/simd.h
+++ b/src/drivers/videolog/simd.h
@ -0,0 +1,365 @@
+#if defined(__MMX__) && !defined(__x86_64) /* MMX available and not targeting x86-64 */
+#define USE_MMX
+#endif /* USE_MMX selects c64_MMX as the c64 type near the end of this header */
+#if defined(__SSE__)
+#define USE_SSE
+#endif /* USE_SSE is not referenced in the visible part of this header */
+
+/* SIMD interface (MMX) written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ */
+
+#ifdef __3dNOW__
+# include <mm3dnow.h> /* Note: not available on ICC */ 
+#elif defined(__MMX__)
+# include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+ #ifdef __ICC
+ typedef __m128 __v4sf;
+ #endif
+#endif
+
+/* Helpers shared by both 64-bit vector wrappers: saturating narrowing
+ * of a wide integer, and bit-spreading of a 32-bit word into 64 bits.
+ */
+struct c64_common
+{
+    /* Saturate v into the signed 8-bit range [-128, 127]. */
+    static signed char clamp_s8(int_fast64_t v)
+    {
+        if(v < -128) return -128;
+        if(v >  127) return  127;
+        return v;
+    }
+    /* Saturate v into the unsigned 8-bit range [0, 255]. */
+    static unsigned char clamp_u8(int_fast64_t v)
+    {
+        if(v <   0) return   0;
+        if(v > 255) return 255;
+        return v;
+    }
+    /* Saturate v into the signed 16-bit range [-32768, 32767]. */
+    static short clamp_s16(int_fast64_t v)
+    {
+        if(v < -32768) return -32768;
+        if(v >  32767) return  32767;
+        return v;
+    }
+
+    /* Spread the four bytes of a over four 16-bit slots:
+     * 0000abcd -> 0a0b0c0d (each byte ends up in the low half of its slot).
+     */
+    static inline uint_fast64_t expand32_8(uint_fast32_t a)
+    {
+        const uint_fast64_t byte0 = a & 0xFFU;
+        const uint_fast64_t byte1 = a & 0xFF00U;
+        const uint_fast64_t byte2 = a & 0xFF0000U;
+        const uint_fast64_t byte3 = a & 0xFF000000UL;
+        return byte0
+             | (byte1 <<  8)   // bit  8 -> bit 16
+             | (byte2 << 16)   // bit 16 -> bit 32
+             | (byte3 << 24);  // bit 24 -> bit 48
+    }
+    /* Spread the two 16-bit halves of a over two 32-bit slots:
+     * 0000abcd -> 00ab00cd.
+     */
+    static inline uint_fast64_t expand32_16(uint_fast32_t a)
+    {
+        const uint_fast64_t low_half  = a & 0xFFFFU;
+        const uint_fast64_t high_half = a & 0xFFFF0000UL;
+        return low_half | (high_half << 16);   // bit 16 -> bit 32
+    }
+};
+
+#ifdef __MMX__
+/* 64-bit integers that use MMX / 3Dnow operations where relevant
+ *
+ * Thin wrapper around a single __m64: most members forward directly to
+ * one MMX intrinsic. Only compiled when __MMX__ is defined.
+ */
+struct c64_MMX: public c64_common
+{
+    typedef c64_MMX c64;
+    /* The wrapped MMX register value. */
+    __m64 value;
+    /* Constructors: the default one leaves value uninitialized; the uint64_t form reinterprets the bits through a pointer cast. */
+    inline c64_MMX() { }
+    inline c64_MMX(__m64 v) : value(v) { }
+    inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }
+    inline c64_MMX(int v) : value(_m_from_int(v)) { }
+    inline c64_MMX(short a,short b,short c, short d)
+        : value(_mm_setr_pi16(a,b,c,d)) { }
+    /* Shifts: a negative count is redirected to the opposite direction; the compound forms (<<=, >>=) do not perform that check. */
+    inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
+    inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); } // packs *this with itself
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); } // packs *this with itself
+    /* Raw 8-byte load/store through a __m64 pointer cast. */
+    void Get(const unsigned char* p)      { value = *(const __m64*)p; }
+    void Put(      unsigned char* p)const { *(__m64*)p =  value; }
+    
+    void Init16(short a,short b,short c, short d)
+        { value = _mm_setr_pi16(a,b,c,d); }
+    void Init16(short a)
+        { value = _mm_set1_pi16(a); }
+    /* NOTE(review): GetD is currently identical to Get (reads a full 8 bytes) -- confirm whether a 32-bit load was intended. */
+    void GetD(const unsigned char* p)      { value = *(const __m64*)p; }
+    /* Lane extraction by reinterpreting the register's storage; n indexes lanes in memory order. */
+    template<int n>
+    short Extract16() const { return ((const short*)&value)[n]; }
+    template<int n>
+    int Extract32() const { return ((const int*)&value)[n]; }
+    /* Extract88_*: return data[0] OR'ed with the short read at byte offset 1
+     * of the selected half (lo = bytes 0..3, hi = bytes 4..7). The short read
+     * is unaligned and casts away const.
+     * NOTE(review): byte 1 is mixed into the low byte of the result, so this
+     * presumably relies on the upper byte of each 16-bit lane being zero -- confirm.
+     */
+    short Extract88_from_1616lo() const
+    {
+        const unsigned char* data = (const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:        H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    short Extract88_from_1616hi() const
+    {
+        const unsigned char* data = 4+(const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:    H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    
+    /* In-place bitwise and arithmetic operators. */
+    c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
+    c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
+    c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
+    c64& operator+= (const c64& b) { return *this = *this + b; }
+    c64& operator-= (const c64& b) { return *this = *this - b; }
+    /* Bitwise NOT, implemented as XOR with an all-ones pattern. */
+    c64 operator~ () const {
+        static const uint_least64_t negpat = ~(uint_least64_t)0;
+        return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
+    }
+    
+            /* psllqi: p = packed
+                       s = shift
+                       r = right, l = left
+                       l = shift in zero, a = shift in sign bit
+                       q = 64-bit, d = 32-bit, w = 16-bit
+                      [i = immed amount]
+             */
+    c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
+    c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
+    c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
+    /* 64-bit add/subtract: uses the si64 intrinsics when __SSE2__ is defined, otherwise plain uint64_t arithmetic through a reference cast. */
+    c64 operator- (const c64& b) const
+    {
+        #ifdef __SSE2__
+        return _mm_sub_si64(value, b.value);
+        #else
+        return (const uint64_t&)value - (const uint64_t&)b.value;
+        #endif
+    }
+    c64 operator+ (const c64& b) const
+    {
+        #ifdef __SSE2__
+        return _mm_add_si64(value, b.value);
+        #else
+        return (const uint64_t&)value + (const uint64_t&)b.value;
+        #endif
+    }
+    
+    /* Packed lane operations; each forwards to the intrinsic named in its body. */
+    c64 shl64(int b) const { return _mm_slli_si64(value, b); }
+    c64 shr64(int b) const { return _mm_srli_si64(value, b); }
+    c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
+    c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
+    c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
+    c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
+    c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
+    c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
+    c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
+    c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
+    c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
+    c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
+    //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
+    c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
+    c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
+    /* Interleaves: note that the argument b is passed as the FIRST intrinsic operand and *this as the second. */
+    c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
+    c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
+    c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
+    c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
+    c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
+    c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
+    /* NOTE(review): this overload is not const-qualified, unlike the c64 overload of operator& above. */
+    c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
+    /* Pack conversions: *this supplies the first intrinsic operand, b the second. */
+    c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
+    c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
+    c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
+};
+#endif
+
+/* Portable stand-in for c64_MMX: the same member interface implemented on
+ * a plain 64-bit integer that is treated as packed 8/16/32-bit lanes.
+ *
+ * Lane 0 is the least significant part of `value` in the shift-based code.
+ * The lane loops generated by simdsim/usimdsim index lanes through a pointer
+ * cast (memory order), which agrees with that numbering only on
+ * little-endian hosts.
+ */
+struct c64_nonMMX: public c64_common
+{
+    typedef c64_nonMMX c64;
+    
+    uint_least64_t value;
+    
+    /* The default constructor leaves value uninitialized. */
+    inline c64_nonMMX() { }
+    inline c64_nonMMX(uint64_t v) : value(v) { }
+    inline c64_nonMMX(int v) : value(v) { }
+    inline c64_nonMMX(short a,short b,short c, short d)
+        { Init16(a,b,c,d); }
+
+    /* Whole-value shifts; a negative count shifts in the opposite direction.
+     * The compound forms do not perform that check. */
+    c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
+    c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); }
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); }
+
+    /* Build the value from four 16-bit lanes, a being the lowest. */
+    void Init16(short a,short b,short c, short d)
+        { uint_fast64_t aa = (unsigned short)a,
+                        bb = (unsigned short)b,
+                        cc = (unsigned short)c,
+                        dd = (unsigned short)d;
+          value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
+    void Init16(short a)
+        { Init16(a,a,a,a); }
+    /* Build the value from eight bytes, a being the lowest.
+     * Every byte is widened BEFORE shifting: computing (d << 24) in int
+     * would overflow for d >= 128 and sign-extend into the upper half. */
+    void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
+               unsigned char e,unsigned char f,unsigned char g,unsigned char h)
+    {
+        value =  ((uint_fast64_t)a)
+              | (((uint_fast64_t)b) << 8)
+              | (((uint_fast64_t)c) << 16)
+              | (((uint_fast64_t)d) << 24)
+              | (((uint_fast64_t)e) << 32)
+              | (((uint_fast64_t)f) << 40)
+              | (((uint_fast64_t)g) << 48)
+              | (((uint_fast64_t)h) << 56);
+    }
+
+    /* Raw 8-byte load/store through a pointer cast. */
+    void Get(const unsigned char* p)      { value = *(const uint_least64_t*)p; }
+    void Put(      unsigned char* p)const { *(uint_least64_t*)p =  value; }
+    
+    c64& operator&= (const c64& b) { value&=b.value; return *this; }
+    c64& operator|= (const c64& b) { value|=b.value; return *this; }
+    c64& operator^= (const c64& b) { value^=b.value; return *this; }
+    c64& operator+= (const c64& b) { value+=b.value; return *this; }
+    c64& operator-= (const c64& b) { value-=b.value; return *this; }
+    c64 operator& (const c64& b) const { return value & b.value; }
+    c64 operator| (const c64& b) const { return value | b.value; }
+    c64 operator^ (const c64& b) const { return value ^ b.value; }
+    c64 operator- (const c64& b) const { return value - b.value; }
+    c64 operator+ (const c64& b) const { return value + b.value; }
+
+    c64 operator& (uint_fast64_t b) const { return value & b; }
+
+    c64 operator~ () const { return ~value; }
+    
+    /* usimdsim: apply "lane op b" to `count` lanes of `type`, b being a scalar. */
+    #define usimdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op b)
+
+    /* simdsim: apply "lane op other_lane" pairwise to `count` lanes of `type`. */
+    #define simdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        const type* o = (const type*)&b.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
+    
+    c64 shl64(int b) const { return value << b; }
+    c64 shr64(int b) const { return value >> b; }
+    /* A 64-bit value holds FOUR 16-bit lanes; all of them must be shifted.
+     * shl16 works on unsigned lanes so that no negative value is left-shifted. */
+    c64 shl16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, <<); return res; }
+    c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, >>); return res; }
+    c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
+    c64 sar16(int b) const { c64 res = *this; usimdsim(short, 4, >>); return res; }
+
+    c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
+    c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
+    c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; }
+    c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
+    c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
+    /* The trailing ">> 16" attaches to the macro's final assignment, so each
+     * lane becomes the upper half of the (int-promoted) product. */
+    c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }
+    c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
+    c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
+    
+    #undef simdsim
+    #undef usimdsim
+    
+    /* Sign-extend the low 16 / 32 bits of v. The pack conversions below
+     * operate on SIGNED lanes, so a lane must be sign-extended before it is
+     * clamped; clamping the zero-extended bits would saturate every negative
+     * lane to the maximum instead. */
+    static int_fast64_t sext16(uint_least64_t v)
+        { return (int_fast64_t)((v & 0xFFFFU) ^ 0x8000U) - 0x8000; }
+    static int_fast64_t sext32(uint_least64_t v)
+        { return (int_fast64_t)((v & 0xFFFFFFFFUL) ^ 0x80000000UL)
+               - (int_fast64_t)0x80000000UL; }
+    
+    /* Two signed 32-bit lanes of *this followed by two of b, each saturated
+     * to signed 16 bits. */
+    c64 conv_s32_s16(const c64& b) const
+    {
+        c64 res;
+        res.Init16(clamp_s16(sext32(value)),
+                   clamp_s16(sext32(value >> 32)),
+                   clamp_s16(sext32(b.value)),
+                   clamp_s16(sext32(b.value >> 32)));
+        return res;
+    }
+    /* Four signed 16-bit lanes of *this followed by four of b, each
+     * saturated to unsigned 8 bits (negative lanes become 0). */
+    c64 conv_s16_u8(const c64& b) const
+    {
+        c64 res;
+        res.Init8(clamp_u8(sext16(value)),
+                  clamp_u8(sext16(value >> 16)),
+                  clamp_u8(sext16(value >> 32)),
+                  clamp_u8(sext16(value >> 48)),
+                  clamp_u8(sext16(b.value)),
+                  clamp_u8(sext16(b.value >> 16)),
+                  clamp_u8(sext16(b.value >> 32)),
+                  clamp_u8(sext16(b.value >> 48)));
+        return res;
+    }
+    /* Four signed 16-bit lanes of *this followed by four of b, each
+     * saturated to signed 8 bits. */
+    c64 conv_s16_s8(const c64& b) const
+    {
+        c64 res;
+        res.Init8(clamp_s8(sext16(value)),
+                  clamp_s8(sext16(value >> 16)),
+                  clamp_s8(sext16(value >> 32)),
+                  clamp_s8(sext16(value >> 48)),
+                  clamp_s8(sext16(b.value)),
+                  clamp_s8(sext16(b.value >> 16)),
+                  clamp_s8(sext16(b.value >> 32)),
+                  clamp_s8(sext16(b.value >> 48)));
+        return res;
+    }
+
+    /* Interleave operations. In the intrinsic branch the argument p is the
+     * FIRST operand and *this the second, so p supplies the lower element of
+     * every interleaved pair. The portable branch follows the same order so
+     * that both branches of a function produce the same result.
+     * TODO: Verify on a non-MMX target (though they should never be used anyway) */
+    c64 unpacklbw(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        /* ICC says [error: type of cast must be integral or enum]
+         * on the return value cast,
+         * so we cannot use this code on ICC. Fine for GCC. */
+        return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value;
+        return expand32_8(b) | (expand32_8(a) << 8);
+    #endif
+    }
+    c64 unpackhbw(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value;
+        return expand32_8(b>>32) | (expand32_8(a>>32) << 8);
+    #endif
+    }
+    c64 unpacklwd(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value;
+        return expand32_16(b) | (expand32_16(a) << 16);
+    #endif
+    }
+    c64 unpackhwd(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=value, b=p.value;
+        return expand32_16(b>>32) | (expand32_16(a>>32) << 16);
+    #endif
+    }
+    c64 unpackldq() const { return unpackldq(*this); }
+    c64 unpackldq(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        /* Low dword of p in the low half, low dword of *this in the high
+         * half; the mask keeps p's high dword out of the result. */
+        return (p.value & 0xFFFFFFFFUL) | (value << 32);
+    #endif
+    }
+};
+
+/* c64 is the vector type the rest of the code uses: the portable
+ * implementation unless USE_MMX was enabled at the top of this header. */
+#ifndef USE_MMX
+typedef c64_nonMMX c64;
+#else
+typedef c64_MMX c64;
+#endif
+
+/* Issue the MMX state-clearing intrinsic for the current target:
+ * _m_femms() when __3dNOW__ is defined, _mm_empty() when only __MMX__ is,
+ * and nothing at all otherwise.
+ */
+static inline void MMX_clear()
+{
+#if defined(__3dNOW__)
+    /* Note: not available on ICC or Valgrind */
+    _m_femms();
+#elif defined(__MMX__)
+    _mm_empty();
+#else
+    /* no MMX support compiled in: nothing to clear */
+#endif
+}
--- a/src/movie.cpp
+++ b/src/movie.cpp
@ -25,6 +25,10 @@
 #include "utils/memorystream.h"
 #include "utils/xstring.h"

+#ifdef CREATE_AVI
+#include "drivers/videolog/nesvideos-piece.h"
+#endif
+
 #ifdef WIN32
 #include <windows.h>
 #endif
@ -61,7 +65,7 @@ EMOVIEMODE movieMode = MOVIEMODE_INACTIVE;

 //this should not be set unless we are in MOVIEMODE_RECORD!
 //FILE* fpRecordingMovie = 0;
-fstream* osRecordingMovie = 0;
+std::ostream* osRecordingMovie = 0;

 int currFrameCounter;
 uint32 cur_input_display = 0;
@ -69,6 +73,7 @@ int pauseframe = -1;
 bool movie_readonly = true;
 int input_display = 0;
 int frame_display = 0;
+int last_displayed_framenumber = -1;

 SFORMAT FCEUMOV_STATEINFO[]={
 	{ &currFrameCounter, 4|FCEUSTATE_RLSB, "FCNT"},
@ -305,8 +310,8 @@ MovieData::MovieData()
 	: version(MOVIE_VERSION)
 	, emuVersion(FCEU_VERSION_NUMERIC)
 	, palFlag(false)
-	, binaryFlag(false)
 	, rerecordCount(1)
+	, binaryFlag(false)
 	, greenZoneCount(0)
 {
 	memset(&romChecksum,0,sizeof(MD5DATA));
@ -760,6 +765,14 @@ void FCEUI_LoadMovie(const char *fname, bool _read_only, bool tasedit, int _paus
 		else
 			FCEU_DispMessage("Replay started Read+Write.");
 	}
+	
+	#ifdef CREATE_AVI
+	if(LoggingEnabled)
+	{
+	    FCEU_DispMessage("Video recording enabled.\n");
+	    LoggingEnabled = 2;
+	}
+	#endif
 }

 static void openRecordingMovie(const char* fname)
@ -916,8 +929,12 @@ void FCEUMOV_AddCommand(int cmd)

 void FCEU_DrawMovies(uint8 *XBuf)
 {
-	if(frame_display)
+	if(frame_display
+	&& movieMode != MOVIEMODE_INACTIVE
+	&& currFrameCounter != last_displayed_framenumber)
 	{
+		last_displayed_framenumber = currFrameCounter;
+		
 		char counterbuf[32] = {0};
 		if(movieMode == MOVIEMODE_PLAY)
 			sprintf(counterbuf,"%d/%d",currFrameCounter,currMovieData.records.size());
--- a/src/video.cpp
+++ b/src/video.cpp
@ -50,6 +50,10 @@
 #include "fceulua.h"
 #endif

+#ifdef CREATE_AVI
+#include "drivers/videolog/nesvideos-piece.h"
+#endif
+
 uint8 *XBuf=NULL;
 uint8 *XBackBuf=NULL;
 static uint8 *xbsave=NULL;
@ -351,6 +355,17 @@ void FCEU_DispMessage(char *format, ...)

 	guiMessage.howlong = 180;
 	guiMessage.isMovieMessage = false;
+	
+	#ifdef CREATE_AVI
+	if(LoggingEnabled == 2)
+	{
+		/* While in AVI recording mode, only display bare minimum
+		 * of messages
+		 */
+		if(strcmp(guiMessage.errmsg, "Movie playback stopped.") != 0)
+			guiMessage.howlong = 0;
+	}
+	#endif
 }

 void FCEU_ResetMessages()