win32: start converting filters to 32bit

2009-08-10 02:30:32 +00:00 · 2009-08-10 02:30:32 +00:00 · 88f0549e85
parent c4be07bc87
commit 88f0549e85
8 changed files with 4184 additions and 1158 deletions
--- a/desmume/src/windows/filter/2xsai.cpp
+++ b/desmume/src/windows/filter/2xsai.cpp
--- a/desmume/src/windows/filter/bilinear.cpp
+++ b/desmume/src/windows/filter/bilinear.cpp
@ -6,13 +6,15 @@
 #include "types.h"
-int systemRedShift    = 10;
+int systemRedShift    = 24;
-int systemGreenShift  = 0;
+int systemGreenShift  = 16;
-int systemBlueShift   = 5;
+int systemBlueShift   = 8;
-
+/*
 #define RGB1(r,g,b) ((r)>>3) << systemRedShift |\
  ((g) >> 3) << systemGreenShift |\
  ((b) >> 3) << systemBlueShift\
 */
 // Packs r, g and b into one 32-bit pixel at the systemRedShift /
 // systemGreenShift / systemBlueShift bit positions and sets the low byte
 // to 255. The expansion is parenthesized as a whole so it can be combined
 // with other operators, and each component is widened to unsigned before
 // the shift so that a value >= 128 shifted by 24 cannot overflow an int.
 #define RGB1(r,g,b) \
   ((((unsigned)(r)) << systemRedShift) | \
    (((unsigned)(g)) << systemGreenShift) | \
    (((unsigned)(b)) << systemBlueShift) | \
    255u)
 static void fill_rgb_row_16(u16 *from, int src_width, u8 *row, int width)
 {
@ -37,6 +39,29 @@ static void fill_rgb_row_16(u16 *from, int src_width, u8 *row, int width)
  }
 }
 // Unpacks one row of 32-bit pixels into consecutive r,g,b byte triples.
 //   from      - source pixels; src_width of them are read
 //   src_width - number of source pixels to unpack
 //   row       - destination; receives width*3 bytes
 //   width     - total number of triples to produce
 // Triples beyond src_width are copies of the last unpacked pixel.
 static void fill_rgb_row_32(u32 *from, int src_width, u8 *row, int width)
 {
   u8 *copy_start = row + src_width*3;
   u8 *all_stop = row + width*3;
   while (row < copy_start) {
     u32 color = *from++;
     // RGB1 stores each component as a full byte at its shift position, so
     // take the whole byte back out; that keeps unpack-then-RGB1 lossless
     *row++ = (u8)((color >> systemRedShift) & 0xff);
     *row++ = (u8)((color >> systemGreenShift) & 0xff);
     *row++ = (u8)((color >> systemBlueShift) & 0xff);
   }
   // with no unpacked pixel there is nothing to replicate (and row-3 would
   // point before the buffer), so pad with zeroes instead
   if (src_width <= 0) {
     while (row < all_stop)
       *row++ = 0;
     return;
   }
   // any remaining elements to be written to 'row' are a replica of the
   // preceding pixel
   u8 *p = row-3;
   while (row < all_stop) {
     // three elements per pixel; 'p' trails 'row' by exactly one triple
     *row++ = *p++;
     *row++ = *p++;
     *row++ = *p++;
   }
 }
 void Bilinear(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
              u8 *dstPtr, u32 dstPitch, int width, int height)
 {
@ -221,6 +246,188 @@ void BilinearPlus(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
  }
 }
 // 2x bilinear upscale of a 32-bit image.
 //   srcPtr, srcPitch - source pixels and the source row stride in bytes
 //   dstPtr, dstPitch - destination pixels and the destination row stride in
 //                      bytes; two destination rows are written per source row
 //   width, height    - source size in pixels
 // The third parameter (delta buffer) is unused.
 void Bilinear32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
                 u8 *dstPtr, u32 dstPitch, int width, int height)
 {
   // each row buffer holds at most 322 r,g,b triples; width+1 are used
   u8 row_cur[3*322];
   u8 row_next[3*322];
   // refuse sizes that would overrun the row buffers or leave
   // fill_rgb_row_32 without a source pixel to replicate
   if (width <= 0 || height <= 0 || width+1 > 322)
     return;
   u8 *rgb_row_cur = row_cur;
   u8 *rgb_row_next = row_next;
   u32 *to = (u32 *)dstPtr;
   u32 *to_odd = (u32 *)(dstPtr + dstPitch);
   u32 *from = (u32 *)srcPtr;
   fill_rgb_row_32(from, width, rgb_row_cur, width+1);
   for(int y = 0; y < height; y++) {
     u32 *from_orig = from;
     u32 *to_orig = to;
     // the row below starts srcPitch bytes further on, the same stride used
     // to advance 'from' at the bottom of this loop; the last row has no
     // row below it and is paired with itself
     if (y+1 < height)
       fill_rgb_row_32((u32 *)((u8 *)from + srcPitch), width, rgb_row_next,
                       width+1);
     else
       fill_rgb_row_32(from, width, rgb_row_next, width+1);
     // every pixel in the src region is extended to 4 pixels in the
     // destination, arranged in a square 'quad'; if the current src
     // pixel is 'a', then in what follows 'b' is the src pixel to the
     // right, 'c' is the src pixel below, and 'd' is the src pixel to
     // the right and down
     u8 *cur_row  = rgb_row_cur;
     u8 *next_row = rgb_row_next;
     u8 *ar = cur_row++;
     u8 *ag = cur_row++;
     u8 *ab = cur_row++;
     u8 *cr = next_row++;
     u8 *cg = next_row++;
     u8 *cb = next_row++;
     for(int x=0; x < width; x++) {
       u8 *br = cur_row++;
       u8 *bg = cur_row++;
       u8 *bb = cur_row++;
       u8 *dr = next_row++;
       u8 *dg = next_row++;
       u8 *db = next_row++;
       // upper left pixel in quad: just copy it in
       *to++ = RGB1(*ar, *ag, *ab);
       // upper right: average of 'a' and 'b'
       *to++ = RGB1((*ar+*br)>>1, (*ag+*bg)>>1, (*ab+*bb)>>1);
       // lower left: average of 'a' and 'c'
       *to_odd++ = RGB1((*ar+*cr)>>1, (*ag+*cg)>>1, (*ab+*cb)>>1);
       // lower right: average of all four
       *to_odd++ = RGB1((*ar+*br+*cr+*dr)>>2,
                       (*ag+*bg+*cg+*dg)>>2,
                       (*ab+*bb+*cb+*db)>>2);
       // 'b' becomes 'a', 'd' becomes 'c'
       ar = br;
       ag = bg;
       ab = bb;
       cr = dr;
       cg = dg;
       cb = db;
     }
     // the "next" rgb row becomes the current; the old current rgb row is
     // recycled and serves as the new "next" row
     u8 *temp = rgb_row_cur;
     rgb_row_cur = rgb_row_next;
     rgb_row_next = temp;
     // update the pointers for start of next pair of lines
     from = (u32 *)((u8 *)from_orig + srcPitch);
     to = (u32 *)((u8 *)to_orig + (dstPitch << 1));
     to_odd = (u32 *)((u8 *)to + dstPitch);
   }
 }
 // 2x "bilinear plus" upscale of a 32-bit image: like Bilinear32, except
 // that the upper-left pixel of each quad is a weighted blend of the source
 // pixel with its right and lower neighbours instead of a plain copy.
 //   srcPtr, srcPitch - source pixels and the source row stride in bytes
 //   dstPtr, dstPitch - destination pixels and the destination row stride in
 //                      bytes; two destination rows are written per source row
 //   width, height    - source size in pixels
 // The third parameter (delta buffer) is unused.
 void BilinearPlus32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
                     u8 *dstPtr, u32 dstPitch, int width, int height)
 {
   // each row buffer holds at most 322 r,g,b triples; width+1 are used
   u8 row_cur[3*322];
   u8 row_next[3*322];
   // refuse sizes that would overrun the row buffers or leave
   // fill_rgb_row_32 without a source pixel to replicate
   if (width <= 0 || height <= 0 || width+1 > 322)
     return;
   u8 *rgb_row_cur = row_cur;
   u8 *rgb_row_next = row_next;
   u32 *to = (u32 *)dstPtr;
   u32 *to_odd = (u32 *)(dstPtr + dstPitch);
   u32 *from = (u32 *)srcPtr;
   fill_rgb_row_32(from, width, rgb_row_cur, width+1);
   for(int y = 0; y < height; y++) {
     u32 *from_orig = from;
     u32 *to_orig = to;
     // the row below starts srcPitch bytes further on, the same stride used
     // to advance 'from' at the bottom of this loop; the last row has no
     // row below it and is paired with itself
     if (y+1 < height)
       fill_rgb_row_32((u32 *)((u8 *)from + srcPitch), width, rgb_row_next,
                       width+1);
     else
       fill_rgb_row_32(from, width, rgb_row_next, width+1);
     // every pixel in the src region is extended to 4 pixels in the
     // destination, arranged in a square 'quad'; if the current src
     // pixel is 'a', then in what follows 'b' is the src pixel to the
     // right, 'c' is the src pixel below, and 'd' is the src pixel to
     // the right and down
     u8 *cur_row  = rgb_row_cur;
     u8 *next_row = rgb_row_next;
     u8 *ar = cur_row++;
     u8 *ag = cur_row++;
     u8 *ab = cur_row++;
     u8 *cr = next_row++;
     u8 *cg = next_row++;
     u8 *cb = next_row++;
     for(int x=0; x < width; x++) {
       u8 *br = cur_row++;
       u8 *bg = cur_row++;
       u8 *bb = cur_row++;
       u8 *dr = next_row++;
       u8 *dg = next_row++;
       u8 *db = next_row++;
       // upper left pixel in quad: weighted blend of 'a', 'b' and 'c'
 #ifdef USE_ORIGINAL_BILINEAR_PLUS
       // weights a:b:c = 5:2:1 (sum 8); packed with RGB1 like the other
       // three pixels of the quad
       *to++ = RGB1(
                   (((*ar)<<2) +((*ar)) + (*cr+*br+*br) )>> 3,
                   (((*ag)<<2) +((*ag)) + (*cg+*bg+*bg) )>> 3,
                   (((*ab)<<2) +((*ab)) + (*cb+*bb+*bb) )>> 3);
 #else
       // weights a:b:c = 10:2:2 (divided by 16)
       *to++ = RGB1(
                   (((*ar)<<3) +((*ar)<<1) + (*cr+*br+*br+*cr) )>> 4,
                   (((*ag)<<3) +((*ag)<<1) + (*cg+*bg+*bg+*cg) )>> 4,
                   (((*ab)<<3) +((*ab)<<1) + (*cb+*bb+*bb+*cb) )>> 4);
 #endif
       // upper right: average of 'a' and 'b'
       *to++ = RGB1((*ar+*br)>>1, (*ag+*bg)>>1, (*ab+*bb)>>1);
       // lower left: average of 'a' and 'c'
       *to_odd++ = RGB1((*ar+*cr)>>1, (*ag+*cg)>>1, (*ab+*cb)>>1);
       // lower right: average of all four
       *to_odd++ = RGB1((*ar+*br+*cr+*dr)>>2,
                       (*ag+*bg+*cg+*dg)>>2,
                       (*ab+*bb+*cb+*db)>>2);
       // 'b' becomes 'a', 'd' becomes 'c'
       ar = br;
       ag = bg;
       ab = bb;
       cr = dr;
       cg = dg;
       cb = db;
     }
     // the "next" rgb row becomes the current; the old current rgb row is
     // recycled and serves as the new "next" row
     u8 *temp = rgb_row_cur;
     rgb_row_cur = rgb_row_next;
     rgb_row_next = temp;
     // update the pointers for start of next pair of lines
     from = (u32 *)((u8 *)from_orig + srcPitch);
     to = (u32 *)((u8 *)to_orig + (dstPitch << 1));
     to_odd = (u32 *)((u8 *)to + dstPitch);
   }
 }
 void RenderBilinear (SSurface Src, SSurface Dst)
 {
@ -229,8 +436,8 @@ void RenderBilinear (SSurface Src, SSurface Dst)
    lpSrc = Src.Surface;
    lpDst = Dst.Surface;
-    Bilinear (lpSrc, Src.Pitch,
+    Bilinear32 (lpSrc, Src.Pitch*2,
                lpSrc,
-                lpDst, Dst.Pitch, Src.Width, Src.Height);
+                lpDst, Dst.Pitch*2, Src.Width, Src.Height);
 }
--- a/desmume/src/windows/filter/hq2x.cpp
+++ b/desmume/src/windows/filter/hq2x.cpp
--- a/desmume/src/windows/filter/hq2x.h
+++ b/desmume/src/windows/filter/hq2x.h
--- a/desmume/src/windows/filter/interp.h
+++ b/desmume/src/windows/filter/interp.h
@ -0,0 +1,355 @@
 /*
 * This file is part of the Advance project.
 *
 * Copyright (C) 2003 Andrea Mazzoleni
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * In addition, as a special exception, Andrea Mazzoleni
 * gives permission to link the code of this program with
 * the MAME library (or with modified versions of MAME that use the
 * same license as MAME), and distribute linked combinations including
 * the two.  You must obey the GNU General Public License in all
 * respects for all of the code used other than MAME.  If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so.  If you do not wish to
 * do so, delete this exception statement from your version.
 */
 #ifndef __INTERP_H
 #define __INTERP_H
 /* Unsigned integer aliases used by the interp_16_ and interp_32_ helpers in this header.
  * NOTE(review): other sources in this tree take these names from "types.h";
  * confirm the two sets of definitions do not clash where both are included. */
 typedef unsigned char u8;
 typedef unsigned short u16;
 typedef unsigned int u32;
 /***************************************************************************/
 /* Basic types */
 /***************************************************************************/
 /* interpolation */
 /* Channel masks for the 16-bit helpers and the pixel depth they were chosen
  * for; both are assigned by interp_set(). */
 extern unsigned interp_mask[2];
 extern unsigned interp_bits_per_pixel;
 /* Keep only the bits selected by interp_mask[0] / interp_mask[1].  The
  * argument is parenthesized so that expressions using operators of lower
  * precedence than '&' are masked as a whole. */
 #define INTERP_16_MASK_1(v) ((v) & interp_mask[0])
 #define INTERP_16_MASK_2(v) ((v) & interp_mask[1])
 /* Blend of three 16-bit pixels weighted 5:2:1 (sum 8), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_521(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*5 + INTERP_16_MASK_1(p2)*2 + INTERP_16_MASK_1(p3)) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*5 + INTERP_16_MASK_2(p2)*2 + INTERP_16_MASK_2(p3)) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 3:3:2 (sum 8), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_332(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*3 + INTERP_16_MASK_1(p2)*3 + INTERP_16_MASK_1(p3)*2) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*3 + INTERP_16_MASK_2(p2)*3 + INTERP_16_MASK_2(p3)*2) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 6:1:1 (sum 8), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_611(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*6 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3)) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*6 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3)) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of two 16-bit pixels weighted 7:1 (sum 8), computed separately on
  * the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_71(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*7 + INTERP_16_MASK_1(p2)) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*7 + INTERP_16_MASK_2(p2)) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 2:1:1 (sum 4), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_211(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*2 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3)) / 4;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*2 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3)) / 4;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 7:7:2 (sum 16), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_772(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = ((INTERP_16_MASK_1(p1) + INTERP_16_MASK_1(p2))*7 + INTERP_16_MASK_1(p3)*2) / 16;
   const unsigned mix2 = ((INTERP_16_MASK_2(p1) + INTERP_16_MASK_2(p2))*7 + INTERP_16_MASK_2(p3)*2) / 16;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Equal-weight blend of two 16-bit pixels, computed separately on the bits
  * selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_11(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1) + INTERP_16_MASK_1(p2)) / 2;
   const unsigned mix2 = (INTERP_16_MASK_2(p1) + INTERP_16_MASK_2(p2)) / 2;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of two 16-bit pixels weighted 3:1 (sum 4), computed separately on
  * the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_31(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*3 + INTERP_16_MASK_1(p2)) / 4;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*3 + INTERP_16_MASK_2(p2)) / 4;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 14:1:1 (sum 16), computed
  * separately on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_1411(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*14 + INTERP_16_MASK_1(p2) + INTERP_16_MASK_1(p3)) / 16;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*14 + INTERP_16_MASK_2(p2) + INTERP_16_MASK_2(p3)) / 16;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of three 16-bit pixels weighted 4:3:1 (sum 8), computed separately
  * on the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_431(u16 p1, u16 p2, u16 p3)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*4 + INTERP_16_MASK_1(p2)*3 + INTERP_16_MASK_1(p3)) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*4 + INTERP_16_MASK_2(p2)*3 + INTERP_16_MASK_2(p3)) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of two 16-bit pixels weighted 5:3 (sum 8), computed separately on
  * the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_53(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*5 + INTERP_16_MASK_1(p2)*3) / 8;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*5 + INTERP_16_MASK_2(p2)*3) / 8;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of two 16-bit pixels weighted 15:1 (sum 16), computed separately on
  * the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_151(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*15 + INTERP_16_MASK_1(p2)) / 16;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*15 + INTERP_16_MASK_2(p2)) / 16;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Blend of two 16-bit pixels weighted 9:7 (sum 16), computed separately on
  * the bits selected by interp_mask[0] and by interp_mask[1]. */
 static inline u16 interp_16_97(u16 p1, u16 p2)
 {
   const unsigned mix1 = (INTERP_16_MASK_1(p1)*9 + INTERP_16_MASK_1(p2)*7) / 16;
   const unsigned mix2 = (INTERP_16_MASK_2(p1)*9 + INTERP_16_MASK_2(p2)*7) / 16;
   return INTERP_16_MASK_1(mix1) | INTERP_16_MASK_2(mix2);
 }
 /* Keep bytes 0 and 2 (MASK_1) or byte 1 (MASK_2) of a 32-bit pixel.  The
  * argument is parenthesized so that expressions using operators of lower
  * precedence than '&' are masked as a whole. */
 #define INTERP_32_MASK_1(v) ((v) & 0xFF00FF)
 #define INTERP_32_MASK_2(v) ((v) & 0x00FF00)
 /* Blend of three 32-bit pixels weighted 5:2:1 (sum 8); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_521(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*5 + INTERP_32_MASK_1(p2)*2 + INTERP_32_MASK_1(p3)) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*5 + INTERP_32_MASK_2(p2)*2 + INTERP_32_MASK_2(p3)) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 3:3:2 (sum 8); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_332(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*3 + INTERP_32_MASK_1(p2)*3 + INTERP_32_MASK_1(p3)*2) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*3 + INTERP_32_MASK_2(p2)*3 + INTERP_32_MASK_2(p3)*2) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 2:1:1 (sum 4); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_211(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*2 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3)) / 4;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*2 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3)) / 4;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 6:1:1 (sum 8); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_611(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*6 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3)) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*6 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3)) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of two 32-bit pixels weighted 7:1 (sum 8); bytes 0 and 2 are mixed
  * together in one pass and byte 1 in another. */
 static inline u32 interp_32_71(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*7 + INTERP_32_MASK_1(p2)) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*7 + INTERP_32_MASK_2(p2)) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 7:7:2 (sum 16); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_772(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = ((INTERP_32_MASK_1(p1) + INTERP_32_MASK_1(p2))*7 + INTERP_32_MASK_1(p3)*2) / 16;
   const u32 mix2 = ((INTERP_32_MASK_2(p1) + INTERP_32_MASK_2(p2))*7 + INTERP_32_MASK_2(p3)*2) / 16;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Equal-weight blend of two 32-bit pixels; bytes 0 and 2 are mixed together
  * in one pass and byte 1 in another. */
 static inline u32 interp_32_11(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1) + INTERP_32_MASK_1(p2)) / 2;
   const u32 mix2 = (INTERP_32_MASK_2(p1) + INTERP_32_MASK_2(p2)) / 2;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of two 32-bit pixels weighted 3:1 (sum 4); bytes 0 and 2 are mixed
  * together in one pass and byte 1 in another. */
 static inline u32 interp_32_31(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*3 + INTERP_32_MASK_1(p2)) / 4;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*3 + INTERP_32_MASK_2(p2)) / 4;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 14:1:1 (sum 16); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_1411(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*14 + INTERP_32_MASK_1(p2) + INTERP_32_MASK_1(p3)) / 16;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*14 + INTERP_32_MASK_2(p2) + INTERP_32_MASK_2(p3)) / 16;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of three 32-bit pixels weighted 4:3:1 (sum 8); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_431(u32 p1, u32 p2, u32 p3)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*4 + INTERP_32_MASK_1(p2)*3 + INTERP_32_MASK_1(p3)) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*4 + INTERP_32_MASK_2(p2)*3 + INTERP_32_MASK_2(p3)) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of two 32-bit pixels weighted 5:3 (sum 8); bytes 0 and 2 are mixed
  * together in one pass and byte 1 in another. */
 static inline u32 interp_32_53(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*5 + INTERP_32_MASK_1(p2)*3) / 8;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*5 + INTERP_32_MASK_2(p2)*3) / 8;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of two 32-bit pixels weighted 15:1 (sum 16); bytes 0 and 2 are
  * mixed together in one pass and byte 1 in another. */
 static inline u32 interp_32_151(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*15 + INTERP_32_MASK_1(p2)) / 16;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*15 + INTERP_32_MASK_2(p2)) / 16;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /* Blend of two 32-bit pixels weighted 9:7 (sum 16); bytes 0 and 2 are mixed
  * together in one pass and byte 1 in another. */
 static inline u32 interp_32_97(u32 p1, u32 p2)
 {
   const u32 mix1 = (INTERP_32_MASK_1(p1)*9 + INTERP_32_MASK_1(p2)*7) / 16;
   const u32 mix2 = (INTERP_32_MASK_2(p1)*9 + INTERP_32_MASK_2(p2)*7) / 16;
   return INTERP_32_MASK_1(mix1) | INTERP_32_MASK_2(mix2);
 }
 /***************************************************************************/
 /* diff */
 /* Thresholds that interp_16_diff() and interp_32_diff() apply to their y, u
  * and v combinations of the per-channel differences; a magnitude above any
  * one of them makes the two pixels count as different. */
 #define INTERP_Y_LIMIT (0x30*4)
 #define INTERP_U_LIMIT (0x07*4)
 #define INTERP_V_LIMIT (0x06*8)
 /*
  * Reports whether two 16-bit pixels differ by more than the
  * INTERP_Y_LIMIT / INTERP_U_LIMIT / INTERP_V_LIMIT thresholds.
  *
  * The pixels are read as 5-6-5 when interp_bits_per_pixel is 16 and as
  * 5-5-5 otherwise.  Returns 0 for identical or close pixels and 1 when any
  * of the y, u or v combinations exceeds its limit.
  */
 static int interp_16_diff(u16 p1, u16 p2)
 {
   int r, g, b;
   int y, u, v;
   if (p1 == p2)
     return 0;
   /* Per-channel differences scaled to an 8-bit range.  Multiplication and
    * division stand in for shifts because the differences may be negative:
    * left-shifting a negative int is undefined and right-shifting one is
    * implementation-defined.  Each divided difference is an exact multiple
    * of its divisor, so no rounding takes place. */
   if (interp_bits_per_pixel == 16) {
     b = ((int)(p1 & 0x1F) - (int)(p2 & 0x1F)) * 8;
     g = ((int)(p1 & 0x7E0) - (int)(p2 & 0x7E0)) / 8;
     r = ((int)(p1 & 0xF800) - (int)(p2 & 0xF800)) / 256;
   } else {
     b = ((int)(p1 & 0x1F) - (int)(p2 & 0x1F)) * 8;
     g = ((int)(p1 & 0x3E0) - (int)(p2 & 0x3E0)) / 4;
     r = ((int)(p1 & 0x7C00) - (int)(p2 & 0x7C00)) / 128;
   }
   y = r + g + b;
   u = r - b;
   v = -r + 2*g - b;
   if (y < -INTERP_Y_LIMIT || y > INTERP_Y_LIMIT)
     return 1;
   if (u < -INTERP_U_LIMIT || u > INTERP_U_LIMIT)
     return 1;
   if (v < -INTERP_V_LIMIT || v > INTERP_V_LIMIT)
     return 1;
   return 0;
 }
 /*
  * Reports whether two 32-bit pixels differ by more than the
  * INTERP_Y_LIMIT / INTERP_U_LIMIT / INTERP_V_LIMIT thresholds.
  * Returns 0 for close pixels and 1 otherwise.
  */
 static int interp_32_diff(u32 p1, u32 p2)
 {
   /* pixels that agree in the top five bits of each of the three low bytes
    * are treated as equal without further work */
   if (((p1 ^ p2) & 0xF8F8F8) == 0)
     return 0;
   /* per-byte differences, taken from bits 0-7, 8-15 and 16-23 */
   const int db = (int)((p1 & 0xFF) - (p2 & 0xFF));
   const int dg = (int)((p1 & 0xFF00) - (p2 & 0xFF00)) >> 8;
   const int dr = (int)((p1 & 0xFF0000) - (p2 & 0xFF0000)) >> 16;
   const int y = dr + dg + db;
   const int u = dr - db;
   const int v = -dr + 2*dg - db;
   const int y_out = (y < -INTERP_Y_LIMIT || y > INTERP_Y_LIMIT);
   const int u_out = (u < -INTERP_U_LIMIT || u > INTERP_U_LIMIT);
   const int v_out = (v < -INTERP_V_LIMIT || v > INTERP_V_LIMIT);
   return (y_out || u_out || v_out) ? 1 : 0;
 }
 /* Threshold for the weighted |y|, |u|, |v| sum computed by interp_16_diff2()
  * and interp_32_diff2(). */
 #define INTERP_LIMIT2 (96000)
 /* Small arithmetic helpers.  Each one evaluates its arguments more than
  * once, so the arguments must be free of side effects. */
 #define ABS(x) ((x) < 0 ? -(x) : (x))
 #define MAX(x,y) ((x) > (y) ? (x) : (y))
 #define MIN(x,y) ((x) < (y) ? (x) : (y))
 /*
  * Second difference test for 16-bit pixels: returns 1 when the weighted sum
  * 11*|y| + 8*|u| + 6*|v| of the colour difference exceeds INTERP_LIMIT2 and
  * 0 otherwise.
  *
  * The pixels are read as 5-6-5 when interp_bits_per_pixel is 16 and as
  * 5-5-5 otherwise.
  */
 static int interp_16_diff2(u16 p1, u16 p2)
 {
   int r, g, b;
   int y, u, v;
   /* pixels that agree in every bit of the 0xF79E mask count as equal */
   if ((p1 & 0xF79E) == (p2 & 0xF79E))
     return 0;
   /* Per-channel differences scaled to an 8-bit range.  Multiplication and
    * division stand in for shifts because the differences may be negative:
    * left-shifting a negative int is undefined and right-shifting one is
    * implementation-defined.  Each divided difference is an exact multiple
    * of its divisor, so no rounding takes place. */
   if (interp_bits_per_pixel == 16) {
     b = ((int)(p1 & 0x1F) - (int)(p2 & 0x1F)) * 8;
     g = ((int)(p1 & 0x7E0) - (int)(p2 & 0x7E0)) / 8;
     r = ((int)(p1 & 0xF800) - (int)(p2 & 0xF800)) / 256;
   } else {
     b = ((int)(p1 & 0x1F) - (int)(p2 & 0x1F)) * 8;
     g = ((int)(p1 & 0x3E0) - (int)(p2 & 0x3E0)) / 4;
     r = ((int)(p1 & 0x7C00) - (int)(p2 & 0x7C00)) / 128;
   }
   y =  33*r + 36*g + 31*b;
   u = -14*r - 29*g + 44*b;
   v =  62*r - 51*g - 10*b;
   if (11*ABS(y) + 8*ABS(u) + 6*ABS(v) > INTERP_LIMIT2)
     return 1;
   return 0;
 }
 /*
  * Second difference test for 32-bit pixels: returns 1 when the weighted sum
  * 11*|y| + 8*|u| + 6*|v| of the colour difference exceeds INTERP_LIMIT2 and
  * 0 otherwise.
  */
 static int interp_32_diff2(u32 p1, u32 p2)
 {
   /* pixels that agree in the top four bits of each of the three low bytes
    * are treated as equal without further work */
   if (((p1 ^ p2) & 0xF0F0F0) == 0)
     return 0;
   /* per-byte differences using only the top five bits of each byte */
   const int db = (int)((p1 & 0xF8) - (p2 & 0xF8));
   const int dg = (int)((p1 & 0xF800) - (p2 & 0xF800)) >> 8;
   const int dr = (int)((p1 & 0xF80000) - (p2 & 0xF80000)) >> 16;
   const int y =  33*dr + 36*dg + 31*db;
   const int u = -14*dr - 29*dg + 44*db;
   const int v =  62*dr - 51*dg - 10*db;
   const int weighted = 11*ABS(y) + 8*ABS(u) + 6*ABS(v);
   return (weighted > INTERP_LIMIT2) ? 1 : 0;
 }
 /*
  * Records the pixel depth in interp_bits_per_pixel and, for depths 15, 16
  * and 32, loads the matching pair of channel masks into interp_mask.  Any
  * other depth leaves interp_mask as it was.
  */
 static void interp_set(unsigned bits_per_pixel)
 {
   interp_bits_per_pixel = bits_per_pixel;
   if (bits_per_pixel == 15) {
     interp_mask[0] = 0x7C1F;
     interp_mask[1] = 0x03E0;
   } else if (bits_per_pixel == 16) {
     interp_mask[0] = 0xF81F;
     interp_mask[1] = 0x07E0;
   } else if (bits_per_pixel == 32) {
     interp_mask[0] = 0xFF00FF;
     interp_mask[1] = 0x00FF00;
   }
 }
 #endif
--- a/desmume/src/windows/filter/scanline.cpp
+++ b/desmume/src/windows/filter/scanline.cpp
@ -42,7 +42,7 @@ FORCEINLINE void ScanLine16_2( uint16 *lpDst, uint16 *lpSrc, unsigned int Width)
 	}
 }
-FORCEINLINE void DoubleLine16( uint16 *lpDst, uint16 *lpSrc, unsigned int Width){
+FORCEINLINE void DoubleLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width){
 	while(Width--){
 		*lpDst++ = *lpSrc;
 		*lpDst++ = *lpSrc++;
@ -69,17 +69,17 @@ void RenderScanline( SSurface Src, SSurface Dst)
 void RenderNearest2X (SSurface Src, SSurface Dst)
 {
-	uint16 *lpSrc;
+	uint32 *lpSrc;
 	unsigned int H;
 	const uint32 srcHeight = Src.Height;
 	const unsigned int srcPitch = Src.Pitch >> 1;
-	lpSrc = reinterpret_cast<uint16 *>(Src.Surface);
+	lpSrc = reinterpret_cast<uint32 *>(Src.Surface);
 	const unsigned int dstPitch = Dst.Pitch >> 1;
-	uint16 *lpDst = (uint16*)Dst.Surface;
+	uint32 *lpDst = (uint32*)Dst.Surface;
 	for (H = 0; H < srcHeight; H++, lpSrc += srcPitch)
-		DoubleLine16 (lpDst, lpSrc, Src.Width), lpDst += dstPitch,
+		DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch,
-		DoubleLine16 (lpDst, lpSrc, Src.Width), lpDst += dstPitch;
+		DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch;
 }
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@ -720,7 +720,7 @@ template<typename T, int bpp> static void doRotate(void* dst)
 {
 	u8* buffer = (u8*)dst;
 	int size = video.size();
-	u32* src = video.filteredbuffer32bpp;
+	u32* src = (u32*)video.finalBuffer();
 	switch(video.rotation)
 	{
 	case 0:
@ -881,24 +881,24 @@ static void DoDisplay(bool firstTime)
 			aggDraw.hud->attach(video.srcBuffer, 256, 384, 512);
 			DoDisplay_DrawHud();
 		}
 		//apply user's filter
 		video.filter();
 	}
 	//convert pixel format to 32bpp for compositing
 	//why do we do this over and over? well, we are compositing to 
 	//filteredbuffer32bpp, and it needs to get refreshed each frame..
 	const int size = video.size();
-	u16* src = video.finalBuffer();
+	u16* src = (u16*)video.srcBuffer;
 	for(int i=0;i<size;i++)
-		video.filteredbuffer32bpp[i] = RGB15TO24_REVERSE(src[i]);
+		video.buffer[i] = RGB15TO24_REVERSE(src[i]);
 	//apply user's filter
 	video.filter();
 	if(!CommonSettings.single_core)
 	{
 		//draw and composite the OSD (but not if we are drawing osd straight to screen)
 		DoDisplay_DrawHud();
-		T_AGG_RGBA target((u8*)video.filteredbuffer32bpp, video.width,video.height,video.width*4);
+		T_AGG_RGBA target((u8*)video.finalBuffer(), video.width,video.height,video.width*4);
 		target.transformImage(aggDraw.hud->image<T_AGG_PF_RGBA>(), 0,0,video.width-1,video.height-1);
 		aggDraw.hud->clear();
 	}
--- a/desmume/src/windows/video.h
+++ b/desmume/src/windows/video.h
@ -13,8 +13,8 @@ public:
 	int currentfilter;
 	u8* srcBuffer;
-	CACHE_ALIGN u8 filteredbuffer[4*256*192*4];
+	CACHE_ALIGN u32 buffer[4*256*192*2];
-	CACHE_ALIGN u32 filteredbuffer32bpp[4*256*192*2];
+	CACHE_ALIGN u32 filteredbuffer[4*256*192*2];
 	enum {
 		NONE,
@ -56,7 +56,7 @@ public:
 	u16* finalBuffer() const
 	{
 		if(currentfilter == NONE)
-			return (u16*)srcBuffer;
+			return (u16*)buffer;
 		else return (u16*)filteredbuffer;
 	}
@ -65,7 +65,7 @@ public:
 		src.Height = 384;
 		src.Width = 256;
 		src.Pitch = 512;
-		src.Surface = (u8*)srcBuffer;
+		src.Surface = (u8*)buffer;
 		dst.Height = 768;
 		dst.Width = 512;