win32: start converting filters to 32bit

2009-08-10 02:30:32 +00:00 · 2009-08-10 02:30:32 +00:00 · 88f0549e85
parent c4be07bc87
commit 88f0549e85
8 changed files with 4184 additions and 1158 deletions
--- a/desmume/src/windows/filter/2xsai.cpp
+++ b/desmume/src/windows/filter/2xsai.cpp
--- a/desmume/src/windows/filter/bilinear.cpp
+++ b/desmume/src/windows/filter/bilinear.cpp
@ -6,13 +6,15 @@

 #include "types.h"

-int systemRedShift    = 10;
-int systemGreenShift  = 0;
-int systemBlueShift   = 5;
-
+// Bit offsets of the red, green and blue components inside a packed 32-bit
+// pixel; RGB1() and fill_rgb_row_32() shift by these amounts.
+int systemRedShift    = 24;
+int systemGreenShift  = 16;
+int systemBlueShift   = 8;
+/*
 #define RGB1(r,g,b) ((r)>>3) << systemRedShift |\
  ((g) >> 3) << systemGreenShift |\
  ((b) >> 3) << systemBlueShift\
+*/
+// Packs 8-bit r/g/b components into one 32-bit pixel using the system*Shift
+// offsets and sets the low byte to 0xFF. Components are widened to u32 first
+// so that shifting one into the top byte cannot overflow a signed int, and
+// the whole expansion is parenthesized so it is safe inside any expression.
+#define RGB1(r,g,b) ((((u32)(r)) << systemRedShift) | (((u32)(g)) << systemGreenShift) | (((u32)(b)) << systemBlueShift) | 0xFFu)

 static void fill_rgb_row_16(u16 *from, int src_width, u8 *row, int width)
 {
@ -37,6 +39,29 @@ static void fill_rgb_row_16(u16 *from, int src_width, u8 *row, int width)
  }
 }

+// Unpacks one row of packed 32-bit pixels into a byte array holding three
+// bytes (red, green, blue) per pixel.
+//   from      - first source pixel of the row
+//   src_width - number of pixels read from 'from'
+//   row       - destination; must have room for width*3 bytes
+//   width     - number of pixels written; entries past src_width repeat the
+//               last pixel that was read
+static void fill_rgb_row_32(u32 *from, int src_width, u8 *row, int width)
+{
+  u8 *all_stop = row + width*3;
+
+  // with nothing to read there is no "preceding pixel" to replicate, and
+  // row-3 would point in front of the buffer; emit black instead
+  if (src_width <= 0) {
+    while (row < all_stop)
+      *row++ = 0;
+    return;
+  }
+
+  u8 *copy_start = row + src_width*3;
+  while (row < copy_start) {
+    u32 color = *from++;
+    // RGB1() stores full 8-bit components, so read all eight bits back; the
+    // 5-bit mask and <<3 rescale of the 16-bit variant would keep only the
+    // low five bits of each component here
+    *row++ = (u8)((color >> systemRedShift) & 0xff);
+    *row++ = (u8)((color >> systemGreenShift) & 0xff);
+    *row++ = (u8)((color >> systemBlueShift) & 0xff);
+  }
+  // any remaining elements to be written to 'row' are a replica of the
+  // preceding pixel
+  u8 *p = row-3;
+  while (row < all_stop) {
+    // three elements per pixel, copied one component at a time
+    *row++ = *p++;
+    *row++ = *p++;
+    *row++ = *p++;
+  }
+}
+
 void Bilinear(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
              u8 *dstPtr, u32 dstPitch, int width, int height)
 {
@ -221,6 +246,188 @@ void BilinearPlus(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
  }
 }

+// 2x bilinear upscale of a 32-bit image: every source pixel becomes a 2x2
+// quad in the destination.
+//   srcPtr/srcPitch - source pixels and the distance in bytes between rows
+//   dstPtr/dstPitch - destination pixels and the distance in bytes between rows
+//   width/height    - source size in pixels; width+1 pixels must fit into the
+//                     scratch rows, otherwise nothing is drawn
+// The third parameter (deltaPtr) is unused.
+void Bilinear32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
+                u8 *dstPtr, u32 dstPitch, int width, int height)
+{
+  u8 row_cur[3*322];
+  u8 row_next[3*322];
+
+  // each scratch row holds width+1 pixels (one replicated pixel on the
+  // right); refuse sizes that would overrun them or that are empty
+  if (height < 1 || width < 1 || width + 1 > (int)(sizeof(row_cur) / 3))
+    return;
+
+  u8 *rgb_row_cur = row_cur;
+  u8 *rgb_row_next = row_next;
+
+  u32 *to = (u32 *)dstPtr;
+  u32 *to_odd = (u32 *)(dstPtr + dstPitch);
+
+  int from_width = width;
+
+  u32 *from = (u32 *)srcPtr;
+  fill_rgb_row_32(from, from_width, rgb_row_cur, width+1);
+
+  for(int y = 0; y < height; y++) {
+    u32 *from_orig = from;
+    u32 *to_orig = to;
+
+    // the row below starts srcPitch bytes further on (the same step that
+    // advances 'from' at the bottom of this loop); on the last row the
+    // current row is reused so the bottom edge blends with itself
+    if (y+1 < height)
+      fill_rgb_row_32((u32 *)((u8 *)from + srcPitch), from_width,
+                      rgb_row_next, width+1);
+    else
+      fill_rgb_row_32(from, from_width, rgb_row_next, width+1);
+
+    // every pixel in the src region, is extended to 4 pixels in the
+    // destination, arranged in a square 'quad'; if the current src
+    // pixel is 'a', then in what follows 'b' is the src pixel to the
+    // right, 'c' is the src pixel below, and 'd' is the src pixel to
+    // the right and down
+    u8 *cur_row  = rgb_row_cur;
+    u8 *next_row = rgb_row_next;
+    u8 *ar = cur_row++;
+    u8 *ag = cur_row++;
+    u8 *ab = cur_row++;
+    u8 *cr = next_row++;
+    u8 *cg = next_row++;
+    u8 *cb = next_row++;
+    for(int x=0; x < width; x++) {
+      u8 *br = cur_row++;
+      u8 *bg = cur_row++;
+      u8 *bb = cur_row++;
+      u8 *dr = next_row++;
+      u8 *dg = next_row++;
+      u8 *db = next_row++;
+
+      // upper left pixel in quad: just copy it in
+      *to++ = RGB1(*ar, *ag, *ab);
+
+      // upper right
+      *to++ = RGB1((*ar+*br)>>1, (*ag+*bg)>>1, (*ab+*bb)>>1);
+
+      // lower left
+      *to_odd++ = RGB1((*ar+*cr)>>1, (*ag+*cg)>>1, (*ab+*cb)>>1);
+
+      // lower right
+      *to_odd++ = RGB1((*ar+*br+*cr+*dr)>>2,
+                      (*ag+*bg+*cg+*dg)>>2,
+                      (*ab+*bb+*cb+*db)>>2);
+
+      // 'b' becomes 'a', 'd' becomes 'c'
+      ar = br;
+      ag = bg;
+      ab = bb;
+      cr = dr;
+      cg = dg;
+      cb = db;
+    }
+
+    // the "next" rgb row becomes the current; the old current rgb row is
+    // recycled and serves as the new "next" row
+    u8 *temp;
+    temp = rgb_row_cur;
+    rgb_row_cur = rgb_row_next;
+    rgb_row_next = temp;
+
+    // update the pointers for start of next pair of lines
+    from = (u32 *)((u8 *)from_orig + srcPitch);
+    to = (u32 *)((u8 *)to_orig + (dstPitch << 1));
+    to_odd = (u32 *)((u8 *)to + dstPitch);
+  }
+}
+
+// Variant of the 2x bilinear upscale for 32-bit images in which the upper
+// left pixel of each 2x2 quad is a weighted blend of the source pixel with
+// its right and lower neighbours instead of a plain copy.
+//   srcPtr/srcPitch - source pixels and the distance in bytes between rows
+//   dstPtr/dstPitch - destination pixels and the distance in bytes between rows
+//   width/height    - source size in pixels; width+1 pixels must fit into the
+//                     scratch rows, otherwise nothing is drawn
+// The third parameter (deltaPtr) is unused.
+void BilinearPlus32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
+                    u8 *dstPtr, u32 dstPitch, int width, int height)
+{
+  u8 row_cur[3*322];
+  u8 row_next[3*322];
+
+  // each scratch row holds width+1 pixels (one replicated pixel on the
+  // right); refuse sizes that would overrun them or that are empty
+  if (height < 1 || width < 1 || width + 1 > (int)(sizeof(row_cur) / 3))
+    return;
+
+  u8 *rgb_row_cur = row_cur;
+  u8 *rgb_row_next = row_next;
+
+  u32 *to = (u32 *)dstPtr;
+  u32 *to_odd = (u32 *)(dstPtr + dstPitch);
+
+  int from_width = width;
+
+  u32 *from = (u32 *)srcPtr;
+  fill_rgb_row_32(from, from_width, rgb_row_cur, width+1);
+
+  for(int y = 0; y < height; y++) {
+    u32 *from_orig = from;
+    u32 *to_orig = to;
+
+    // the row below starts srcPitch bytes further on (the same step that
+    // advances 'from' at the bottom of this loop); on the last row the
+    // current row is reused so the bottom edge blends with itself
+    if (y+1 < height)
+      fill_rgb_row_32((u32 *)((u8 *)from + srcPitch), from_width,
+                      rgb_row_next, width+1);
+    else
+      fill_rgb_row_32(from, from_width, rgb_row_next, width+1);
+
+    // every pixel in the src region, is extended to 4 pixels in the
+    // destination, arranged in a square 'quad'; if the current src
+    // pixel is 'a', then in what follows 'b' is the src pixel to the
+    // right, 'c' is the src pixel below, and 'd' is the src pixel to
+    // the right and down
+    u8 *cur_row  = rgb_row_cur;
+    u8 *next_row = rgb_row_next;
+    u8 *ar = cur_row++;
+    u8 *ag = cur_row++;
+    u8 *ab = cur_row++;
+    u8 *cr = next_row++;
+    u8 *cg = next_row++;
+    u8 *cb = next_row++;
+    for(int x=0; x < width; x++) {
+      u8 *br = cur_row++;
+      u8 *bg = cur_row++;
+      u8 *bb = cur_row++;
+      u8 *dr = next_row++;
+      u8 *dg = next_row++;
+      u8 *db = next_row++;
+
+      // upper left pixel in quad: weighted towards 'a', with 'b' and 'c'
+      // mixed in (weights 5:2:1 over 8, or 10:2:2 over 16 by default)
+#ifdef USE_ORIGINAL_BILINEAR_PLUS
+      *to++ = RGB(
+                  (((*ar)<<2) +((*ar)) + (*cr+*br+*br) )>> 3,
+                  (((*ag)<<2) +((*ag)) + (*cg+*bg+*bg) )>> 3,
+                  (((*ab)<<2) +((*ab)) + (*cb+*bb+*bb) )>> 3);
+#else
+      *to++ = RGB1(
+                  (((*ar)<<3) +((*ar)<<1) + (*cr+*br+*br+*cr) )>> 4,
+                  (((*ag)<<3) +((*ag)<<1) + (*cg+*bg+*bg+*cg) )>> 4,
+                  (((*ab)<<3) +((*ab)<<1) + (*cb+*bb+*bb+*cb) )>> 4);
+#endif
+
+      // upper right
+      *to++ = RGB1((*ar+*br)>>1, (*ag+*bg)>>1, (*ab+*bb)>>1);
+
+      // lower left
+      *to_odd++ = RGB1((*ar+*cr)>>1, (*ag+*cg)>>1, (*ab+*cb)>>1);
+
+      // lower right
+      *to_odd++ = RGB1((*ar+*br+*cr+*dr)>>2,
+                      (*ag+*bg+*cg+*dg)>>2,
+                      (*ab+*bb+*cb+*db)>>2);
+
+      // 'b' becomes 'a', 'd' becomes 'c'
+      ar = br;
+      ag = bg;
+      ab = bb;
+      cr = dr;
+      cg = dg;
+      cb = db;
+    }
+
+    // the "next" rgb row becomes the current; the old current rgb row is
+    // recycled and serves as the new "next" row
+    u8 *temp;
+    temp = rgb_row_cur;
+    rgb_row_cur = rgb_row_next;
+    rgb_row_next = temp;
+
+    // update the pointers for start of next pair of lines
+    from = (u32 *)((u8 *)from_orig + srcPitch);
+    to = (u32 *)((u8 *)to_orig + (dstPitch << 1));
+    to_odd = (u32 *)((u8 *)to + dstPitch);
+  }
+}
 void RenderBilinear (SSurface Src, SSurface Dst)
 {

@ -229,8 +436,8 @@ void RenderBilinear (SSurface Src, SSurface Dst)
    lpSrc = Src.Surface;
    lpDst = Dst.Surface;

-    Bilinear (lpSrc, Src.Pitch,
+    Bilinear32 (lpSrc, Src.Pitch*2,
                lpSrc,
-                lpDst, Dst.Pitch, Src.Width, Src.Height);
-
+                lpDst, Dst.Pitch*2, Src.Width, Src.Height);
 }
+
--- a/desmume/src/windows/filter/hq2x.cpp
+++ b/desmume/src/windows/filter/hq2x.cpp
--- a/desmume/src/windows/filter/hq2x.h
+++ b/desmume/src/windows/filter/hq2x.h
--- a/desmume/src/windows/filter/interp.h
+++ b/desmume/src/windows/filter/interp.h
@ -0,0 +1,355 @@
+/*
+ * This file is part of the Advance project.
+ *
+ * Copyright (C) 2003 Andrea Mazzoleni
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * In addition, as a special exception, Andrea Mazzoleni
+ * gives permission to link the code of this program with
+ * the MAME library (or with modified versions of MAME that use the
+ * same license as MAME), and distribute linked combinations including
+ * the two.  You must obey the GNU General Public License in all
+ * respects for all of the code used other than MAME.  If you modify
+ * this file, you may extend this exception to your version of the
+ * file, but you are not obligated to do so.  If you do not wish to
+ * do so, delete this exception statement from your version.
+ */
+
+#ifndef __INTERP_H
+#define __INTERP_H
+
+/* Short aliases for the unsigned types used by the helpers in this header.
+ * NOTE(review): other files in this change take u8/u16/u32 from "types.h";
+ * confirm these typedefs agree with, and do not clash with, those. */
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+
+/***************************************************************************/
+/* Basic types */
+
+/***************************************************************************/
+/* interpolation */
+
+/* Channel masks and pixel depth written by interp_set(); only declared here,
+ * the definitions live elsewhere. */
+extern unsigned interp_mask[2];
+extern unsigned interp_bits_per_pixel;
+
+/* Keep the bits of v selected by the first / second mask. The argument is
+ * parenthesized so that lower-precedence operators inside it (such as | or ^)
+ * are applied before the &. */
+#define INTERP_16_MASK_1(v) ((v) & interp_mask[0])
+#define INTERP_16_MASK_2(v) ((v) & interp_mask[1])
+
+/* Blends three pixels with weights 5:2:1 (sum 8). Each mask group is summed
+ * and scaled on its own, then the two groups are recombined. */
+static inline u16 interp_16_521(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*5 + INTERP_16_MASK_1(p2)*2 + INTERP_16_MASK_1(p3);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*5 + INTERP_16_MASK_2(p2)*2 + INTERP_16_MASK_2(p3);
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 3:3:2 (sum 8), one mask group at a time. */
+static inline u16 interp_16_332(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*3 + INTERP_16_MASK_1(p2)*3 + INTERP_16_MASK_1(p3)*2;
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*3 + INTERP_16_MASK_2(p2)*3 + INTERP_16_MASK_2(p3)*2;
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 6:1:1 (sum 8), one mask group at a time. */
+static inline u16 interp_16_611(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*6 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*6 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3);
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 7:1 (sum 8), one mask group at a time. */
+static inline u16 interp_16_71(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*7 + INTERP_16_MASK_1(p2);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*7 + INTERP_16_MASK_2(p2);
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 2:1:1 (sum 4), one mask group at a time. */
+static inline u16 interp_16_211(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*2 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*2 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3);
+  return INTERP_16_MASK_1(sum1 / 4) | INTERP_16_MASK_2(sum2 / 4);
+}
+
+/* Blends three pixels with weights 7:7:2 (sum 16), one mask group at a time. */
+static inline u16 interp_16_772(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = (INTERP_16_MASK_1(p1) + INTERP_16_MASK_1(p2))*7 + INTERP_16_MASK_1(p3)*2;
+  const unsigned sum2 = (INTERP_16_MASK_2(p1) + INTERP_16_MASK_2(p2))*7 + INTERP_16_MASK_2(p3)*2;
+  return INTERP_16_MASK_1(sum1 / 16) | INTERP_16_MASK_2(sum2 / 16);
+}
+
+/* Averages two pixels (weights 1:1), one mask group at a time. */
+static inline u16 interp_16_11(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1) + INTERP_16_MASK_1(p2);
+  const unsigned sum2 = INTERP_16_MASK_2(p1) + INTERP_16_MASK_2(p2);
+  return INTERP_16_MASK_1(sum1 / 2) | INTERP_16_MASK_2(sum2 / 2);
+}
+
+/* Blends two pixels with weights 3:1 (sum 4), one mask group at a time. */
+static inline u16 interp_16_31(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*3 + INTERP_16_MASK_1(p2);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*3 + INTERP_16_MASK_2(p2);
+  return INTERP_16_MASK_1(sum1 / 4) | INTERP_16_MASK_2(sum2 / 4);
+}
+
+/* Blends three pixels with weights 14:1:1 (sum 16), one mask group at a time. */
+static inline u16 interp_16_1411(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*14 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*14 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3);
+  return INTERP_16_MASK_1(sum1 / 16) | INTERP_16_MASK_2(sum2 / 16);
+}
+
+/* Blends three pixels with weights 4:3:1 (sum 8), one mask group at a time. */
+static inline u16 interp_16_431(u16 p1, u16 p2, u16 p3)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*4 + INTERP_16_MASK_1(p2)*3 + INTERP_16_MASK_1(p3);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*4 + INTERP_16_MASK_2(p2)*3 + INTERP_16_MASK_2(p3);
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 5:3 (sum 8), one mask group at a time. */
+static inline u16 interp_16_53(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*5 + INTERP_16_MASK_1(p2)*3;
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*5 + INTERP_16_MASK_2(p2)*3;
+  return INTERP_16_MASK_1(sum1 / 8) | INTERP_16_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 15:1 (sum 16), one mask group at a time. */
+static inline u16 interp_16_151(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*15 + INTERP_16_MASK_1(p2);
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*15 + INTERP_16_MASK_2(p2);
+  return INTERP_16_MASK_1(sum1 / 16) | INTERP_16_MASK_2(sum2 / 16);
+}
+
+/* Blends two pixels with weights 9:7 (sum 16), one mask group at a time. */
+static inline u16 interp_16_97(u16 p1, u16 p2)
+{
+  const unsigned sum1 = INTERP_16_MASK_1(p1)*9 + INTERP_16_MASK_1(p2)*7;
+  const unsigned sum2 = INTERP_16_MASK_2(p1)*9 + INTERP_16_MASK_2(p2)*7;
+  return INTERP_16_MASK_1(sum1 / 16) | INTERP_16_MASK_2(sum2 / 16);
+}
+
+/* Keep bytes 0 and 2 (mask 1) or byte 1 (mask 2) of a 32-bit value; the top
+ * byte is dropped by both. The argument is parenthesized so that
+ * lower-precedence operators inside it are applied before the &. */
+#define INTERP_32_MASK_1(v) ((v) & 0xFF00FF)
+#define INTERP_32_MASK_2(v) ((v) & 0x00FF00)
+
+/* Blends three pixels with weights 5:2:1 (sum 8). Each mask group is summed
+ * and scaled on its own, then the two groups are recombined. */
+static inline u32 interp_32_521(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*5 + INTERP_32_MASK_1(p2)*2 + INTERP_32_MASK_1(p3);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*5 + INTERP_32_MASK_2(p2)*2 + INTERP_32_MASK_2(p3);
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 3:3:2 (sum 8), one mask group at a time. */
+static inline u32 interp_32_332(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*3 + INTERP_32_MASK_1(p2)*3 + INTERP_32_MASK_1(p3)*2;
+  const u32 sum2 = INTERP_32_MASK_2(p1)*3 + INTERP_32_MASK_2(p2)*3 + INTERP_32_MASK_2(p3)*2;
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 2:1:1 (sum 4), one mask group at a time. */
+static inline u32 interp_32_211(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*2 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*2 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3);
+  return INTERP_32_MASK_1(sum1 / 4) | INTERP_32_MASK_2(sum2 / 4);
+}
+
+/* Blends three pixels with weights 6:1:1 (sum 8), one mask group at a time. */
+static inline u32 interp_32_611(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*6 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*6 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3);
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 7:1 (sum 8), one mask group at a time. */
+static inline u32 interp_32_71(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*7 + INTERP_32_MASK_1(p2);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*7 + INTERP_32_MASK_2(p2);
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends three pixels with weights 7:7:2 (sum 16), one mask group at a time. */
+static inline u32 interp_32_772(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = (INTERP_32_MASK_1(p1) + INTERP_32_MASK_1(p2))*7 + INTERP_32_MASK_1(p3)*2;
+  const u32 sum2 = (INTERP_32_MASK_2(p1) + INTERP_32_MASK_2(p2))*7 + INTERP_32_MASK_2(p3)*2;
+  return INTERP_32_MASK_1(sum1 / 16) | INTERP_32_MASK_2(sum2 / 16);
+}
+
+/* Averages two pixels (weights 1:1), one mask group at a time. */
+static inline u32 interp_32_11(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1) + INTERP_32_MASK_1(p2);
+  const u32 sum2 = INTERP_32_MASK_2(p1) + INTERP_32_MASK_2(p2);
+  return INTERP_32_MASK_1(sum1 / 2) | INTERP_32_MASK_2(sum2 / 2);
+}
+
+/* Blends two pixels with weights 3:1 (sum 4), one mask group at a time. */
+static inline u32 interp_32_31(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*3 + INTERP_32_MASK_1(p2);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*3 + INTERP_32_MASK_2(p2);
+  return INTERP_32_MASK_1(sum1 / 4) | INTERP_32_MASK_2(sum2 / 4);
+}
+
+/* Blends three pixels with weights 14:1:1 (sum 16), one mask group at a time. */
+static inline u32 interp_32_1411(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*14 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*14 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3);
+  return INTERP_32_MASK_1(sum1 / 16) | INTERP_32_MASK_2(sum2 / 16);
+}
+
+/* Blends three pixels with weights 4:3:1 (sum 8), one mask group at a time. */
+static inline u32 interp_32_431(u32 p1, u32 p2, u32 p3)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*4 + INTERP_32_MASK_1(p2)*3 + INTERP_32_MASK_1(p3);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*4 + INTERP_32_MASK_2(p2)*3 + INTERP_32_MASK_2(p3);
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 5:3 (sum 8), one mask group at a time. */
+static inline u32 interp_32_53(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*5 + INTERP_32_MASK_1(p2)*3;
+  const u32 sum2 = INTERP_32_MASK_2(p1)*5 + INTERP_32_MASK_2(p2)*3;
+  return INTERP_32_MASK_1(sum1 / 8) | INTERP_32_MASK_2(sum2 / 8);
+}
+
+/* Blends two pixels with weights 15:1 (sum 16), one mask group at a time. */
+static inline u32 interp_32_151(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*15 + INTERP_32_MASK_1(p2);
+  const u32 sum2 = INTERP_32_MASK_2(p1)*15 + INTERP_32_MASK_2(p2);
+  return INTERP_32_MASK_1(sum1 / 16) | INTERP_32_MASK_2(sum2 / 16);
+}
+
+/* Blends two pixels with weights 9:7 (sum 16), one mask group at a time. */
+static inline u32 interp_32_97(u32 p1, u32 p2)
+{
+  const u32 sum1 = INTERP_32_MASK_1(p1)*9 + INTERP_32_MASK_1(p2)*7;
+  const u32 sum2 = INTERP_32_MASK_2(p1)*9 + INTERP_32_MASK_2(p2)*7;
+  return INTERP_32_MASK_1(sum1 / 16) | INTERP_32_MASK_2(sum2 / 16);
+}
+
+/***************************************************************************/
+/* diff */
+
+/* Thresholds that interp_16_diff() and interp_32_diff() apply to the y, u
+ * and v differences; exceeding any one of them makes two pixels count as
+ * different. */
+#define INTERP_Y_LIMIT (0x30*4)
+#define INTERP_U_LIMIT (0x07*4)
+#define INTERP_V_LIMIT (0x06*8)
+
+/*
+ * Reports whether two 16-bit pixels differ by more than the Y/U/V limits.
+ * Returns 0 when the pixels are equal or every component difference lies
+ * within its INTERP_*_LIMIT, and 1 otherwise. The channel layout follows
+ * interp_bits_per_pixel: 5-6-5 when it is 16, 5-5-5 for any other value.
+ */
+static int interp_16_diff(u16 p1, u16 p2)
+{
+  int r, g, b;
+  int y, u, v;
+
+  if (p1 == p2)
+    return 0;
+
+  /* Per-channel differences scaled to an 8-bit range. Multiplication and
+   * division are used instead of shifts because the differences may be
+   * negative; every quotient is exact, since each masked difference is a
+   * multiple of its divisor. */
+  if (interp_bits_per_pixel == 16) {
+    b = ((p1 & 0x1F) - (p2 & 0x1F)) * 8;
+    g = ((p1 & 0x7E0) - (p2 & 0x7E0)) / 8;
+    r = ((p1 & 0xF800) - (p2 & 0xF800)) / 256;
+  } else {
+    b = ((p1 & 0x1F) - (p2 & 0x1F)) * 8;
+    g = ((p1 & 0x3E0) - (p2 & 0x3E0)) / 4;
+    r = ((p1 & 0x7C00) - (p2 & 0x7C00)) / 128;
+  }
+
+  y = r + g + b;
+  u = r - b;
+  v = -r + 2*g - b;
+
+  if (y < -INTERP_Y_LIMIT || y > INTERP_Y_LIMIT)
+    return 1;
+
+  if (u < -INTERP_U_LIMIT || u > INTERP_U_LIMIT)
+    return 1;
+
+  if (v < -INTERP_V_LIMIT || v > INTERP_V_LIMIT)
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Reports whether two 32-bit pixels differ by more than the Y/U/V limits.
+ * Only the low three bytes are examined. Returns 0 when the pixels agree in
+ * the top five bits of each of those bytes, or when every component
+ * difference lies within its INTERP_*_LIMIT; returns 1 otherwise.
+ */
+static int interp_32_diff(u32 p1, u32 p2)
+{
+  int r, g, b;
+  int y, u, v;
+
+  if ((p1 & 0xF8F8F8) == (p2 & 0xF8F8F8))
+    return 0;
+
+  /* Convert each masked channel to int before subtracting so a negative
+   * difference never depends on unsigned wrap-around, and divide instead of
+   * shifting; the quotients are exact because the masked values are
+   * multiples of the divisor. */
+  b = (int)(p1 & 0xFF) - (int)(p2 & 0xFF);
+  g = ((int)(p1 & 0xFF00) - (int)(p2 & 0xFF00)) / 256;
+  r = ((int)(p1 & 0xFF0000) - (int)(p2 & 0xFF0000)) / 65536;
+
+  y = r + g + b;
+  u = r - b;
+  v = -r + 2*g - b;
+
+  if (y < -INTERP_Y_LIMIT || y > INTERP_Y_LIMIT)
+    return 1;
+
+  if (u < -INTERP_U_LIMIT || u > INTERP_U_LIMIT)
+    return 1;
+
+  if (v < -INTERP_V_LIMIT || v > INTERP_V_LIMIT)
+    return 1;
+
+  return 0;
+}
+
+
+/* Limit on the weighted sum 11*|y| + 8*|u| + 6*|v| tested by the *_diff2
+ * functions. */
+#define INTERP_LIMIT2 (96000)
+/* Helper macros; each may evaluate its arguments more than once, so do not
+ * pass expressions with side effects. */
+#define ABS(x) ((x) < 0 ? -(x) : (x))
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+/*
+ * Second difference test for 16-bit pixels. Returns 0 when the pixels agree
+ * under the 0xF79E mask, otherwise converts the channel differences to y/u/v
+ * and returns 1 if 11*|y| + 8*|u| + 6*|v| exceeds INTERP_LIMIT2, else 0.
+ * The channel layout follows interp_bits_per_pixel: 5-6-5 when it is 16,
+ * 5-5-5 for any other value.
+ */
+static int interp_16_diff2(u16 p1, u16 p2)
+{
+  int r, g, b;
+  int y, u, v;
+
+  if ((p1 & 0xF79E) == (p2 & 0xF79E))
+    return 0;
+
+  /* Per-channel differences scaled to an 8-bit range. Multiplication and
+   * division are used instead of shifts because the differences may be
+   * negative; every quotient is exact, since each masked difference is a
+   * multiple of its divisor. */
+  if (interp_bits_per_pixel == 16) {
+    b = ((p1 & 0x1F) - (p2 & 0x1F)) * 8;
+    g = ((p1 & 0x7E0) - (p2 & 0x7E0)) / 8;
+    r = ((p1 & 0xF800) - (p2 & 0xF800)) / 256;
+  } else {
+    b = ((p1 & 0x1F) - (p2 & 0x1F)) * 8;
+    g = ((p1 & 0x3E0) - (p2 & 0x3E0)) / 4;
+    r = ((p1 & 0x7C00) - (p2 & 0x7C00)) / 128;
+  }
+
+//  yb =  30*r + 58*g + 12*b;
+  y =  33*r + 36*g + 31*b;
+  u = -14*r - 29*g + 44*b;
+  v =  62*r - 51*g - 10*b;
+
+  if (11*ABS(y) + 8*ABS(u) + 6*ABS(v) > INTERP_LIMIT2)
+    return 1;
+  return 0;
+}
+
+/*
+ * Second difference test for 32-bit pixels. Returns 0 when the pixels agree
+ * in the top four bits of each of the low three bytes, otherwise converts
+ * the differences of the top five bits of each byte to y/u/v and returns 1
+ * if 11*|y| + 8*|u| + 6*|v| exceeds INTERP_LIMIT2, else 0.
+ */
+static int interp_32_diff2(u32 p1, u32 p2)
+{
+  int r, g, b;
+  int y, u, v;
+
+  if ((p1 & 0xF0F0F0) == (p2 & 0xF0F0F0))
+    return 0;
+
+  /* Convert each masked channel to int before subtracting so a negative
+   * difference never depends on unsigned wrap-around, and divide instead of
+   * shifting; the quotients are exact because the masked values are
+   * multiples of the divisor. */
+  b = (int)(p1 & 0xF8) - (int)(p2 & 0xF8);
+  g = ((int)(p1 & 0xF800) - (int)(p2 & 0xF800)) / 256;
+  r = ((int)(p1 & 0xF80000) - (int)(p2 & 0xF80000)) / 65536;
+
+//  y =  30*r + 58*g + 12*b;
+  y =  33*r + 36*g + 31*b;
+  u = -14*r - 29*g + 44*b;
+  v =  62*r - 51*g - 10*b;
+
+  if (11*ABS(y) + 8*ABS(u) + 6*ABS(v) > INTERP_LIMIT2)
+    return 1;
+
+  return 0;
+}
+
+/*
+ * Records the pixel depth in interp_bits_per_pixel and, for 15, 16 and 32,
+ * loads the matching pair of channel masks into interp_mask. For any other
+ * value the masks are left as they were.
+ */
+static void interp_set(unsigned bits_per_pixel)
+{
+  interp_bits_per_pixel = bits_per_pixel;
+
+  if (bits_per_pixel == 15) {
+    interp_mask[0] = 0x7C1F;
+    interp_mask[1] = 0x03E0;
+  } else if (bits_per_pixel == 16) {
+    interp_mask[0] = 0xF81F;
+    interp_mask[1] = 0x07E0;
+  } else if (bits_per_pixel == 32) {
+    interp_mask[0] = 0xFF00FF;
+    interp_mask[1] = 0x00FF00;
+  }
+}
+
+#endif
--- a/desmume/src/windows/filter/scanline.cpp
+++ b/desmume/src/windows/filter/scanline.cpp
@ -42,7 +42,7 @@ FORCEINLINE void ScanLine16_2( uint16 *lpDst, uint16 *lpSrc, unsigned int Width)
 	}
 }

-FORCEINLINE void DoubleLine16( uint16 *lpDst, uint16 *lpSrc, unsigned int Width){
+FORCEINLINE void DoubleLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width){
 	while(Width--){
 		*lpDst++ = *lpSrc;
 		*lpDst++ = *lpSrc++;
@ -69,17 +69,17 @@ void RenderScanline( SSurface Src, SSurface Dst)

+// Nearest-neighbour 2x upscale of 32-bit pixels: each source row is written
+// to two consecutive destination rows, with every pixel doubled horizontally
+// by DoubleLine32.
 void RenderNearest2X (SSurface Src, SSurface Dst)
 {
-	uint16 *lpSrc;
+	uint32 *lpSrc;
 	unsigned int H;

 	const uint32 srcHeight = Src.Height;

+	// NOTE(review): the row strides below are Pitch/2 although they now step
+	// uint32 pointers; that advances exactly one row only if Pitch is still
+	// expressed for 2-byte pixels -- confirm against the code that fills
+	// SSurface.
 	const unsigned int srcPitch = Src.Pitch >> 1;
-	lpSrc = reinterpret_cast<uint16 *>(Src.Surface);
+	lpSrc = reinterpret_cast<uint32 *>(Src.Surface);

 	const unsigned int dstPitch = Dst.Pitch >> 1;
-	uint16 *lpDst = (uint16*)Dst.Surface;
+	uint32 *lpDst = (uint32*)Dst.Surface;
 	for (H = 0; H < srcHeight; H++, lpSrc += srcPitch)
-		DoubleLine16 (lpDst, lpSrc, Src.Width), lpDst += dstPitch,
-		DoubleLine16 (lpDst, lpSrc, Src.Width), lpDst += dstPitch;
+		DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch,
+		DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch;
 }
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@ -720,7 +720,7 @@ template<typename T, int bpp> static void doRotate(void* dst)
 {
 	u8* buffer = (u8*)dst;
 	int size = video.size();
-	u32* src = video.filteredbuffer32bpp;
+	u32* src = (u32*)video.finalBuffer();
 	switch(video.rotation)
 	{
 	case 0:
@ -881,24 +881,24 @@ static void DoDisplay(bool firstTime)
 			aggDraw.hud->attach(video.srcBuffer, 256, 384, 512);
 			DoDisplay_DrawHud();
 		}
-		
-		//apply user's filter
-		video.filter();
 	}

 	//convert pixel format to 32bpp for compositing
 	//why do we do this over and over? well, we are compositing to 
 	//filteredbuffer32bpp, and it needs to get refreshed each frame..
 	const int size = video.size();
-	u16* src = video.finalBuffer();
+	u16* src = (u16*)video.srcBuffer;
 	for(int i=0;i<size;i++)
-		video.filteredbuffer32bpp[i] = RGB15TO24_REVERSE(src[i]);
+		video.buffer[i] = RGB15TO24_REVERSE(src[i]);
+
+	//apply user's filter
+	video.filter();

 	if(!CommonSettings.single_core)
 	{
 		//draw and composite the OSD (but not if we are drawing osd straight to screen)
 		DoDisplay_DrawHud();
-		T_AGG_RGBA target((u8*)video.filteredbuffer32bpp, video.width,video.height,video.width*4);
+		T_AGG_RGBA target((u8*)video.finalBuffer(), video.width,video.height,video.width*4);
 		target.transformImage(aggDraw.hud->image<T_AGG_PF_RGBA>(), 0,0,video.width-1,video.height-1);
 		aggDraw.hud->clear();
 	}
--- a/desmume/src/windows/video.h
+++ b/desmume/src/windows/video.h
@ -13,8 +13,8 @@ public:
 	int currentfilter;

 	u8* srcBuffer;
-	CACHE_ALIGN u8 filteredbuffer[4*256*192*4];
-	CACHE_ALIGN u32 filteredbuffer32bpp[4*256*192*2];
+	CACHE_ALIGN u32 buffer[4*256*192*2];
+	CACHE_ALIGN u32 filteredbuffer[4*256*192*2];

 	enum {
 		NONE,
@ -56,7 +56,7 @@ public:
+	// Selects the buffer callers should read: 'buffer' when no filter is
+	// selected, otherwise 'filteredbuffer'.
+	// NOTE(review): both buffers are declared as u32 arrays, yet the pointer
+	// is returned as u16*, and callers in main.cpp cast it back to u32* --
+	// consider changing the return type once all callers are converted.
 	u16* finalBuffer() const
 	{
 		if(currentfilter == NONE)
-			return (u16*)srcBuffer;
+			return (u16*)buffer;
 		else return (u16*)filteredbuffer;
 	}

@ -65,7 +65,7 @@ public:
 		src.Height = 384;
 		src.Width = 256;
 		src.Pitch = 512;
-		src.Surface = (u8*)srcBuffer;
+		src.Surface = (u8*)buffer;

 		dst.Height = 768;
 		dst.Width = 512;