Update to xBRZ `1.7`.

Get the files from `https://sourceforge.net/projects/xbrz/files/xBRZ/`, then update the source to handle pitch parameters. In our case the pitch is necessary because our source image has borders (top and right). We want to scale the image without those borders, so we need to adjust the pointers to skip them. If each line holds `width + 1 pixel border`, then we need to scale an image whose line size is `width`, while advancing the source pointer for each new line by an amount that includes the border. Also, since our output buffer allocates room for the border as well, we need to advance the output pointer for each line in a custom way (by the output border in this case) - Fix #164.
2019-08-22 13:55:06 -03:00 · 2019-08-22 13:55:06 -03:00 · 234f1e9b1a
parent 290012d1a7
commit 234f1e9b1a
7 changed files with 650 additions and 307 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -814,8 +814,9 @@ set(
    src/filters/hq2x.h
    src/filters/interp.h
    src/filters/lq2x.h
-    src/filters/xBRZ/config.h
+    src/filters/xBRZ/xbrz_config.h
    src/filters/xBRZ/xbrz.h
+    src/filters/xBRZ/xbrz_tools.h
 )

 set(
--- a/src/filters/xBRZ/xbrz.cpp
+++ b/src/filters/xBRZ/xbrz.cpp
@ -1,13 +1,14 @@
 // ****************************************************************************
-// * This file is part of the HqMAME project. It is distributed under         *
-// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0          *
+// * This file is part of the xBRZ project. It is distributed under           *
+// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
 // *                                                                          *
 // * Additionally and as a special exception, the author gives permission     *
-// * to link the code of this program with the MAME library (or with modified *
-// * versions of MAME that use the same license as MAME), and distribute      *
-// * linked combinations including the two. You must obey the GNU General     *
-// * Public License in all respects for all of the code used other than MAME. *
+// * to link the code of this program with the following libraries            *
+// * (or with modified versions that use the same licenses), and distribute   *
+// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
+// * You must obey the GNU General Public License in all respects for all of  *
+// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
 // * If you modify this file, you may extend this exception to your version   *
 // * of the file, but you are not obligated to do so. If you do not wish to   *
 // * do so, delete this exception statement from your version.                *
@ -15,60 +16,33 @@

 #include "xbrz.h"
 #include <cassert>
-#include <cmath> // abs, pow, sqrt
-#include <algorithm>
 #include <vector>
+#include <algorithm>
+#include <cmath> //std::sqrt
+#include "xbrz_tools.h"
+
+using namespace xbrz;
+

 namespace
 {
-#ifdef _MSC_VER
-    #define FORCE_INLINE __forceinline
-#elif defined __GNUC__
-    #define FORCE_INLINE __attribute__((always_inline)) inline
-#else
-    #define FORCE_INLINE inline
-#endif
-
-
-template <uint32_t N> inline
-unsigned char getByte(uint32_t val) { return static_cast<unsigned char>((val >> (8 * N)) & 0xff); }
-
-inline unsigned char getAlpha(uint32_t pix) { return getByte<3>(pix); }
-inline unsigned char getRed  (uint32_t pix) { return getByte<2>(pix); }
-inline unsigned char getGreen(uint32_t pix) { return getByte<1>(pix); }
-inline unsigned char getBlue (uint32_t pix) { return getByte<0>(pix); }
-
-inline uint32_t makePixel(                 unsigned char r, unsigned char g, unsigned char b) { return             (r << 16) | (g << 8) | b; }
-inline uint32_t makePixel(unsigned char a, unsigned char r, unsigned char g, unsigned char b) { return (a << 24) | (r << 16) | (g << 8) | b; }
-
-
-template <unsigned int M, unsigned int N>
-FORCE_INLINE
-unsigned char calcColor(unsigned char colFront, unsigned char colBack)
-{
-    return (colFront * M + colBack * (N - M)) / N;
-}
-
 template <unsigned int M, unsigned int N> inline
 uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
 {
-    //~ static_assert(0 < M && M < N && N <= 1000, "");
+    static_assert(0 < M && M < N && N <= 1000);

-    return makePixel(calcColor<M, N>(getRed  (pixFront), getRed  (pixBack)),
-                     calcColor<M, N>(getGreen(pixFront), getGreen(pixBack)),
-                     calcColor<M, N>(getBlue (pixFront), getBlue (pixBack)));
+    auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
+
+    return makePixel(calcColor(getRed  (pixFront), getRed  (pixBack)),
+                     calcColor(getGreen(pixFront), getGreen(pixBack)),
+                     calcColor(getBlue (pixFront), getBlue (pixBack)));
 }

-FORCE_INLINE
-unsigned char calcColor(unsigned char colFront, unsigned char colBack, const unsigned int weightFront, const unsigned int weightBack, const unsigned int weightSum)
-{
-    return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
-}

 template <unsigned int M, unsigned int N> inline
 uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
 {
-    //~ static_assert(0 < M && M < N && N <= 1000, "");
+    static_assert(0 < M && M < N && N <= 1000);

    const unsigned int weightFront = getAlpha(pixFront) * M;
    const unsigned int weightBack  = getAlpha(pixBack) * (N - M);
@ -76,10 +50,15 @@ uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate c
    if (weightSum == 0)
        return 0;

+    auto calcColor = [=](unsigned char colFront, unsigned char colBack)
+    {
+        return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
+    };
+
    return makePixel(static_cast<unsigned char>(weightSum / N),
-                     calcColor(getRed  (pixFront), getRed  (pixBack), weightFront, weightBack, weightSum),
-                     calcColor(getGreen(pixFront), getGreen(pixBack), weightFront, weightBack, weightSum),
-                     calcColor(getBlue (pixFront), getBlue (pixBack), weightFront, weightBack, weightSum));
+                     calcColor(getRed  (pixFront), getRed  (pixBack)),
+                     calcColor(getGreen(pixFront), getGreen(pixBack)),
+                     calcColor(getBlue (pixFront), getBlue (pixBack)));
 }


@ -95,24 +74,13 @@ uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate c
 //


-uint32_t*       byteAdvance(      uint32_t* ptr, int bytes) { return reinterpret_cast<      uint32_t*>(reinterpret_cast<      char*>(ptr) + bytes); }
-const uint32_t* byteAdvance(const uint32_t* ptr, int bytes) { return reinterpret_cast<const uint32_t*>(reinterpret_cast<const char*>(ptr) + bytes); }
-
-
-//fill block  with the given color
-inline
-void fillBlock(uint32_t* trg, int pitch, uint32_t col, int blockWidth, int blockHeight)
-{
-    //for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
-    //    std::fill(trg, trg + blockWidth, col);
-
-    for (int y = 0; y < blockHeight; ++y, trg = byteAdvance(trg, pitch))
-        for (int x = 0; x < blockWidth; ++x)
-            trg[x] = col;
-}
-
-inline
-void fillBlock(uint32_t* trg, int pitch, uint32_t col, int n) { fillBlock(trg, pitch, col, n, n); }
+#ifdef _MSC_VER
+    #define FORCE_INLINE __forceinline
+#elif defined __GNUC__
+    #define FORCE_INLINE __attribute__((always_inline)) inline
+#else
+    #define FORCE_INLINE inline
+#endif


 enum RotationDegree //clock-wise
@ -168,7 +136,7 @@ template <class T> inline
 T square(T value) { return value * value; }


-
+#if 0
 inline
 double distRGB(uint32_t pix1, uint32_t pix2)
 {
@ -179,6 +147,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
    //euklidean RGB distance
    return std::sqrt(square(r_diff) + square(g_diff) + square(b_diff));
 }
+#endif


 inline
@ -208,16 +177,20 @@ double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
 }


-struct DistYCbCrBuffer //30% perf boost compared to distYCbCr()!
+inline
+double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
 {
-public:
-    DistYCbCrBuffer() : buffer(256 * 256 * 256)
+    //30% perf boost compared to plain distYCbCr()!
+    //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
+    static const std::vector<float> diffToDist = []
    {
+        std::vector<float> tmp;
+
        for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
        {
-            const int r_diff = getByte<2>(i) * 2 - 255;
-            const int g_diff = getByte<1>(i) * 2 - 255;
-            const int b_diff = getByte<0>(i) * 2 - 255;
+            const int r_diff = static_cast<signed char>(getByte<2>(i)) * 2;
+            const int g_diff = static_cast<signed char>(getByte<1>(i)) * 2;
+            const int b_diff = static_cast<signed char>(getByte<0>(i)) * 2;

            const double k_b = 0.0593; //ITU-R BT.2020 conversion
            const double k_r = 0.2627; //
@ -230,29 +203,39 @@ public:
            const double c_b = scale_b * (b_diff - y);
            const double c_r = scale_r * (r_diff - y);

-            buffer[i] = static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r)));
+            tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
        }
-    }
+        return tmp;
+    }();

-    double dist(uint32_t pix1, uint32_t pix2) const
-    {
-        //if (pix1 == pix2) -> 8% perf degradation!
-        //    return 0;
-        //if (pix1 > pix2)
-        //	  std::swap(pix1, pix2); -> 30% perf degradation!!!
+    //if (pix1 == pix2) -> 8% perf degradation!
+    //    return 0;
+    //if (pix1 < pix2)
+    //    std::swap(pix1, pix2); -> 30% perf degradation!!!

-        const int r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
-        const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
-        const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
+    const int r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
+    const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
+    const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);

-        return buffer[(((r_diff + 255) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
-                      (((g_diff + 255) / 2) <<  8) |
-                      (( b_diff + 255) / 2)];
-    }
+    const size_t index = (static_cast<unsigned char>(r_diff / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
+                         (static_cast<unsigned char>(g_diff / 2) <<  8) |
+                         (static_cast<unsigned char>(b_diff / 2));

-private:
-    std::vector<float> buffer; //consumes 64 MB memory; using double is 2% faster, but takes 128 MB
-} distYCbCrBuffer;
+#if 0 //attention: the following calculation creates an asymmetric color distance!!! (e.g. r_diff=46 will be unpacked as 45, but r_diff=-46 unpacks to -47
+    const size_t index = (((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
+                         (((g_diff + 0xFF) / 2) <<  8) |
+                         (( b_diff + 0xFF) / 2);
+#endif
+    return diffToDist[index];
+}
+
+
+#if defined _MSC_VER && !defined NDEBUG
+    const int debugPixelX = -1;
+    const int debugPixelY = 58;
+
+    thread_local bool breakIntoDebugger = false;
+#endif


 enum BlendType
@ -280,20 +263,6 @@ struct Kernel_4x4 //kernel for preprocessing step
    /**/m, n, o, p;
 };

-template <class ColorDistance>
-FORCE_INLINE
-double dist(uint32_t pix1, uint32_t pix2, const xbrz::ScalerCfg& cfg)
-{
-    return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight);
-}
-
-template <class ColorDistance>
-FORCE_INLINE
-bool eq(uint32_t pix1, uint32_t pix2, const xbrz::ScalerCfg& cfg)
-{
-    return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance;
-}
-
 /*
 input kernel area naming convention:
 -----------------
@ -310,6 +279,11 @@ template <class ColorDistance>
 FORCE_INLINE //detect blend direction
 BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg) //result: F, G, J, K corners of "GradientType"
 {
+#if defined _MSC_VER && !defined NDEBUG
+    if (breakIntoDebugger)
+        __debugbreak(); //__asm int 3;
+#endif
+
    BlendResult result = {};

    if ((ker.f == ker.g &&
@ -318,9 +292,10 @@ BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg)
         ker.g == ker.k))
        return result;

-    const int weight = 4;
-    double jg = dist<ColorDistance>(ker.i, ker.f, cfg) + dist<ColorDistance>(ker.f, ker.c, cfg) + dist<ColorDistance>(ker.n, ker.k, cfg) + dist<ColorDistance>(ker.k, ker.h, cfg) + weight * dist<ColorDistance>(ker.j, ker.g, cfg);
-    double fk = dist<ColorDistance>(ker.e, ker.j, cfg) + dist<ColorDistance>(ker.j, ker.o, cfg) + dist<ColorDistance>(ker.b, ker.g, cfg) + dist<ColorDistance>(ker.g, ker.l, cfg) + weight * dist<ColorDistance>(ker.f, ker.k, cfg);
+    auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
+
+    double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + cfg.centerDirectionBias * dist(ker.j, ker.g);
+    double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + cfg.centerDirectionBias * dist(ker.f, ker.k);

    if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
    {
@ -373,12 +348,12 @@ DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
 #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
 DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
 DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
-DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i,	g)
+DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
 #undef DEF_GETTER


 //compress four blend types into a single byte
-inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
+//inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
 inline BlendType getTopR   (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
 inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
 inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
@ -397,15 +372,6 @@ template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { ret
 template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }


-#ifndef NDEBUG
-#ifdef _MSC_VER
-    int debugPixelX = -1;
-    int debugPixelY = 12;
-    __declspec(thread) bool breakIntoDebugger = false;
-#endif
-#endif
-
-
 /*
 input kernel area naming convention:
 -------------
@ -423,7 +389,7 @@ void blendPixel(const Kernel_3x3& ker,
                unsigned char blendInfo, //result of preprocessing all four corners of pixel "e"
                const xbrz::ScalerCfg& cfg)
 {
-#define a get_a<rotDeg>(ker)
+    //#define a get_a<rotDeg>(ker)
 #define b get_b<rotDeg>(ker)
 #define c get_c<rotDeg>(ker)
 #define d get_d<rotDeg>(ker)
@ -433,33 +399,44 @@ void blendPixel(const Kernel_3x3& ker,
 #define h get_h<rotDeg>(ker)
 #define i get_i<rotDeg>(ker)

-#ifndef NDEBUG
-#ifdef _MSC_VER
+#if defined _MSC_VER && !defined NDEBUG
    if (breakIntoDebugger)
        __debugbreak(); //__asm int 3;
-#endif
 #endif

    const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);

    if (getBottomR(blend) >= BLEND_NORMAL)
    {
-        bool doLineBlend = (getBottomR(blend) >= BLEND_DOMINANT) || !(
-            //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
-            (getTopR(blend) != BLEND_NONE && !eq<ColorDistance>(e, g, cfg)) || //but support double-blending for 90° corners
-            (getBottomL(blend) != BLEND_NONE && !eq<ColorDistance>(e, c, cfg)) ||
-            //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
-            (!eq<ColorDistance>(e, i, cfg) && eq<ColorDistance>(g, h, cfg) &&  eq<ColorDistance>(h , i, cfg) && eq<ColorDistance>(i, f, cfg) && eq<ColorDistance>(f, c, cfg))
-        );
+        auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
+        auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };

-        const uint32_t px = dist<ColorDistance>(e, f, cfg) <= dist<ColorDistance>(e, h, cfg) ? f : h; //choose most similar color
+        const bool doLineBlend = [&]() -> bool
+        {
+            if (getBottomR(blend) >= BLEND_DOMINANT)
+                return true;
+
+            //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
+            if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
+                return false;
+            if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
+                return false;
+
+            //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
+            if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
+                return false;
+
+            return true;
+        }();
+
+        const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color

        OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);

        if (doLineBlend)
        {
-            const double fg = dist<ColorDistance>(f, g, cfg); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
-            const double hc = dist<ColorDistance>(h, c, cfg); //
+            const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
+            const double hc = dist(h, c); //

            const bool haveShallowLine = cfg.steepDirectionThreshold * fg <= hc && e != g && d != g;
            const bool haveSteepLine   = cfg.steepDirectionThreshold * hc <= fg && e != c && b != c;
@ -476,14 +453,14 @@ void blendPixel(const Kernel_3x3& ker,
                if (haveSteepLine)
                    Scaler::blendLineSteep(px, out);
                else
-                    Scaler::blendLineDiagonal(px,out);
+                    Scaler::blendLineDiagonal(px, out);
            }
        }
        else
            Scaler::blendCorner(px, out);
    }

-#undef a
+    //#undef a
 #undef b
 #undef c
 #undef d
@ -496,19 +473,22 @@ void blendPixel(const Kernel_3x3& ker,


 template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
-void scaleImage(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch, uint32_t* trg, int trgWidth, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
+void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, int srcPitch, int trgPitch, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
 {
    yFirst = std::max(yFirst, 0);
    yLast  = std::min(yLast, srcHeight);
    if (yFirst >= yLast || srcWidth <= 0)
        return;

+    //const int trgWidth = srcWidth * Scaler::scale;
+    int trgWidth = trgPitch;
+
    //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
    //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
    const int bufferSize = srcWidth;
    unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
-    std::fill(preProcBuffer, preProcBuffer + bufferSize, 0);
-    //~ static_assert(BLEND_NONE == 0, "");
+    std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
+    static_assert(BLEND_NONE == 0);

    //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
    //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
@ -578,10 +558,8 @@ void scaleImage(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch,

        for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
        {
-#ifndef NDEBUG
-#ifdef _MSC_VER
+#if defined _MSC_VER && !defined NDEBUG
            breakIntoDebugger = debugPixelX == x && debugPixelY == y;
-#endif
 #endif
            //all those bounds checks have only insignificant impact on performance!
            const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
@ -636,7 +614,8 @@ void scaleImage(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch,
            }

            //fill block of size scale * scale with the given color
-            fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale); //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
+            fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
+            //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!

            //blend four corners of current pixel
            if (blendingNeeded(blend_xy)) //good 5% perf-improvement
@ -1031,8 +1010,7 @@ struct ColorDistanceRGB
 {
    static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
    {
-        (void)luminanceWeight; // unused param
-        return distYCbCrBuffer.dist(pix1, pix2);
+        return distYCbCrBuffered(pix1, pix2);

        //if (pix1 == pix2) //about 4% perf boost
        //    return 0;
@ -1044,26 +1022,41 @@ struct ColorDistanceARGB
 {
    static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
    {
-        (void)luminanceWeight; // unused param
        const double a1 = getAlpha(pix1) / 255.0 ;
        const double a2 = getAlpha(pix2) / 255.0 ;
        /*
        Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]

-        	1. if a1 = a2, distance should be: a1 * distYCbCr()
-        	2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
-        	3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
+            1. if a1 = a2, distance should be: a1 * distYCbCr()
+            2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
+            3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
        */

-        //return std::min(a1, a2) * distYCbCrBuffer.dist(pix1, pix2) + 255 * abs(a1 - a2);
+        //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
        //=> following code is 15% faster:
-        const double d = distYCbCrBuffer.dist(pix1, pix2);
+        const double d = distYCbCrBuffered(pix1, pix2);
        if (a1 < a2)
            return a1 * d + 255 * (a2 - a1);
        else
            return a2 * d + 255 * (a1 - a2);

-        //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffer.dist(pix1, pix2)) + square(255 * (a1 - a2)));
+        //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
+    }
+};
+
+
+struct ColorDistanceUnbufferedARGB
+{
+    static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
+    {
+        const double a1 = getAlpha(pix1) / 255.0 ;
+        const double a2 = getAlpha(pix2) / 255.0 ;
+
+        const double d = distYCbCr(pix1, pix2, luminanceWeight);
+        if (a1 < a2)
+            return a1 * d + 255 * (a2 - a1);
+        else
+            return a2 * d + 255 * (a1 - a2);
    }
 };

@ -1088,51 +1081,65 @@ struct ColorGradientARGB
 }


-void xbrz::scale(size_t factor, const uint32_t* src, int srcWidth, int srcHeight, int srcPitch, uint32_t* trg, int trgPitch, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
+void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, int srcPitch, int trgPitch, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
 {
-    if (srcPitch %  static_cast<int>(sizeof(uint32_t)) != 0 ||
-        trgPitch %  static_cast<int>(sizeof(uint32_t)) != 0 ||
-        srcPitch < srcWidth * static_cast<int>(sizeof(uint32_t)) ||
-        trgPitch < srcWidth * static_cast<int>(factor) * static_cast<int>(sizeof(uint32_t)))
+    if (factor == 1)
    {
-        assert(false);
+        std::copy(src + yFirst * srcWidth, src + yLast * srcWidth, trg);
        return;
    }

-    int trgWidth = trgPitch / static_cast<int>(sizeof(uint32_t));
    int srcPPitch = srcPitch / static_cast<int>(sizeof(uint32_t));
+    int trgWidth = trgPitch / static_cast<int>(sizeof(uint32_t));

+    static_assert(SCALE_FACTOR_MAX == 6);
    switch (colFmt)
    {
-        case ARGB:
+        case ColorFormat::RGB:
            switch (factor)
            {
                case 2:
-                    return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 3:
-                    return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 4:
-                    return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 5:
-                    return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 6:
-                    return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
            }
            break;

-        case RGB:
+        case ColorFormat::ARGB:
            switch (factor)
            {
                case 2:
-                    return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 3:
-                    return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 4:
-                    return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 5:
-                    return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
                case 6:
-                    return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, srcWidth, srcHeight, srcPPitch, trg, trgWidth, cfg, yFirst, yLast);
+                    return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
+            }
+            break;
+
+        case ColorFormat::ARGB_UNBUFFERED:
+            switch (factor)
+            {
+                case 2:
+                    return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
+                case 3:
+                    return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
+                case 4:
+                    return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
+                case 5:
+                    return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
+                case 6:
+                    return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, srcPPitch, trgWidth, cfg, yFirst, yLast);
            }
            break;
    }
@ -1144,84 +1151,133 @@ bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, doub
 {
    switch (colFmt)
    {
-        case ARGB:
-            return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
-
-        case RGB:
+        case ColorFormat::RGB:
            return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
+        case ColorFormat::ARGB:
+            return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
+        case ColorFormat::ARGB_UNBUFFERED:
+            return ColorDistanceUnbufferedARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
    }
    assert(false);
    return false;
 }


-void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch,
-                                uint32_t* trg, int trgWidth, int trgHeight, int trgPitch,
-                                SliceType st, int yFirst, int yLast)
+void xbrz::bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
+                         /**/  uint32_t* trg, int trgWidth, int trgHeight)
 {
-    if (srcPitch < srcWidth * static_cast<int>(sizeof(uint32_t))  ||
-        trgPitch < trgWidth * static_cast<int>(sizeof(uint32_t)))
-    {
-        assert(false);
-        return;
-    }
-
-    switch (st)
-    {
-        case NN_SCALE_SLICE_SOURCE:
-            //nearest-neighbor (going over source image - fast for upscaling, since source is read only once
-            yFirst = std::max(yFirst, 0);
-            yLast  = std::min(yLast, srcHeight);
-            if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;
-
-            for (int y = yFirst; y < yLast; ++y)
-            {
-                //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
-                // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight
-
-                //keep within for loop to support MT input slices!
-                const int yTrg_first = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
-                const int yTrg_last  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
-                const int blockHeight = yTrg_last - yTrg_first;
-
-                if (blockHeight > 0)
-                {
-                    const uint32_t* srcLine = byteAdvance(src, y * srcPitch);
-                    uint32_t* trgLine  = byteAdvance(trg, yTrg_first * trgPitch);
-                    int xTrg_first = 0;
-
-                    for (int x = 0; x < srcWidth; ++x)
-                    {
-                        int xTrg_last = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth;
-                        const int blockWidth = xTrg_last - xTrg_first;
-                        if (blockWidth > 0)
-                        {
-                            xTrg_first = xTrg_last;
-                            fillBlock(trgLine, trgPitch, srcLine[x], blockWidth, blockHeight);
-                            trgLine += blockWidth;
-                        }
-                    }
-                }
-            }
-            break;
-
-        case NN_SCALE_SLICE_TARGET:
-            //nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
-            yFirst = std::max(yFirst, 0);
-            yLast  = std::min(yLast, trgHeight);
-            if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return;
-
-            for (int y = yFirst; y < yLast; ++y)
-            {
-                uint32_t* trgLine = byteAdvance(trg, y * trgPitch);
-                const int ySrc = srcHeight * y / trgHeight;
-                const uint32_t* srcLine = byteAdvance(src, ySrc * srcPitch);
-                for (int x = 0; x < trgWidth; ++x)
-                {
-                    const int xSrc = srcWidth * x / trgWidth;
-                    trgLine[x] = srcLine[xSrc];
-                }
-            }
-            break;
-    }
+    bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
+                  trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
+    0, trgHeight, [](uint32_t pix) { return pix; });
 }
+
+
+//Convenience overload for tightly packed 32-bit images: both pitches are taken to be width * sizeof(uint32_t),
+//all target rows [0, trgHeight) are processed and pixels are copied unchanged.
+void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
+                                /**/  uint32_t* trg, int trgWidth, int trgHeight)
+{
+    //identity converter: source and target share the same pixel format
+    const auto keepPixel = [](uint32_t pix) { return pix; };
+
+    nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
+                         trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
+                         0, trgHeight, keepPixel);
+}
+
+
+#if 0
+//#include <ppl.h>
+void bilinearScaleCpu(const uint32_t* src, int srcWidth, int srcHeight,
+                      /**/  uint32_t* trg, int trgWidth, int trgHeight)
+{
+    const int TASK_GRANULARITY = 16;
+
+    concurrency::task_group tg;
+
+    for (int i = 0; i < trgHeight; i += TASK_GRANULARITY)
+        tg.run([=]
+    {
+        const int iLast = std::min(i + TASK_GRANULARITY, trgHeight);
+        xbrz::bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
+                            trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
+        i, iLast, [](uint32_t pix) { return pix; });
+    });
+    tg.wait();
+}
+
+
+//Perf: AMP vs CPU: merely ~10% shorter runtime (scaling 1280x800 -> 1920x1080)
+//#include <amp.h>
+void bilinearScaleAmp(const uint32_t* src, int srcWidth, int srcHeight, //throw concurrency::runtime_exception
+                      /**/  uint32_t* trg, int trgWidth, int trgHeight)
+{
+    //C++ AMP reference:       https://msdn.microsoft.com/en-us/library/hh289390.aspx
+    //introduction to C++ AMP: https://msdn.microsoft.com/en-us/magazine/hh882446.aspx
+    using namespace concurrency;
+    //TODO: pitch
+
+    if (srcHeight <= 0 || srcWidth <= 0) return;
+
+    const float scaleX = static_cast<float>(trgWidth ) / srcWidth;
+    const float scaleY = static_cast<float>(trgHeight) / srcHeight;
+
+    array_view<const uint32_t, 2> srcView(srcHeight, srcWidth, src);
+    array_view<      uint32_t, 2> trgView(trgHeight, trgWidth, trg);
+    trgView.discard_data();
+
+    parallel_for_each(trgView.extent, [=](index<2> idx) restrict(amp) //throw ?
+    {
+        const int y = idx[0];
+        const int x = idx[1];
+        //Perf notes:
+        //    -> float-based calculation is (almost) 2x as fast as double!
+        //    -> no noticeable improvement via tiling: https://msdn.microsoft.com/en-us/magazine/hh882447.aspx
+        //    -> no noticeable improvement with restrict(amp,cpu)
+        //    -> iterating over y-axis only is significantly slower!
+        //    -> pre-calculating x,y-dependent variables in a buffer + array_view<> is ~ 20 % slower!
+        const int y1 = srcHeight * y / trgHeight;
+        int y2 = y1 + 1;
+        if (y2 == srcHeight) --y2;
+
+        const float yy1 = y / scaleY - y1;
+        const float y2y = 1 - yy1;
+        //-------------------------------------
+        const int x1 = srcWidth * x / trgWidth;
+        int x2 = x1 + 1;
+        if (x2 == srcWidth) --x2;
+
+        const float xx1 = x / scaleX - x1;
+        const float x2x = 1 - xx1;
+        //-------------------------------------
+        const float x2xy2y = x2x * y2y;
+        const float xx1y2y = xx1 * y2y;
+        const float x2xyy1 = x2x * yy1;
+        const float xx1yy1 = xx1 * yy1;
+
+        auto interpolate = [=](int offset)
+        {
+            /*
+                https://en.wikipedia.org/wiki/Bilinear_interpolation
+                (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
+                (c12(x2 - x) + c22(x - x1)) * (y  - y1)
+            */
+            const auto c11 = (srcView(y1, x1) >> (8 * offset)) & 0xff;
+            const auto c21 = (srcView(y1, x2) >> (8 * offset)) & 0xff;
+            const auto c12 = (srcView(y2, x1) >> (8 * offset)) & 0xff;
+            const auto c22 = (srcView(y2, x2) >> (8 * offset)) & 0xff;
+
+            return c11 * x2xy2y + c21 * xx1y2y +
+                   c12 * x2xyy1 + c22 * xx1yy1;
+        };
+
+        const float bi = interpolate(0);
+        const float gi = interpolate(1);
+        const float ri = interpolate(2);
+        const float ai = interpolate(3);
+
+        const auto b = static_cast<uint32_t>(bi + 0.5f);
+        const auto g = static_cast<uint32_t>(gi + 0.5f);
+        const auto r = static_cast<uint32_t>(ri + 0.5f);
+        const auto a = static_cast<uint32_t>(ai + 0.5f);
+
+        trgView(y, x) = (a << 24) | (r << 16) | (g << 8) | b;
+    });
+    trgView.synchronize(); //throw ?
+}
+#endif
--- a/src/filters/xBRZ/xbrz.h
+++ b/src/filters/xBRZ/xbrz.h
@ -1,13 +1,14 @@
 // ****************************************************************************
-// * This file is part of the HqMAME project. It is distributed under         *
-// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0          *
+// * This file is part of the xBRZ project. It is distributed under           *
+// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
 // *                                                                          *
 // * Additionally and as a special exception, the author gives permission     *
-// * to link the code of this program with the MAME library (or with modified *
-// * versions of MAME that use the same license as MAME), and distribute      *
-// * linked combinations including the two. You must obey the GNU General     *
-// * Public License in all respects for all of the code used other than MAME. *
+// * to link the code of this program with the following libraries            *
+// * (or with modified versions that use the same licenses), and distribute   *
+// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
+// * You must obey the GNU General Public License in all respects for all of  *
+// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
 // * If you modify this file, you may extend this exception to your version   *
 // * of the file, but you are not obligated to do so. If you do not wish to   *
 // * do so, delete this exception statement from your version.                *
@ -16,10 +17,11 @@
 #ifndef XBRZ_HEADER_3847894708239054
 #define XBRZ_HEADER_3847894708239054

-#include "config.h"
 #include <cstddef> //size_t
+#include <cstdint> //uint32_t
 #include <limits>
-#include <stdint.h> //uint32_t
+#include "xbrz_config.h"
+

 namespace xbrz
 {
@ -38,65 +40,41 @@ http://board.byuu.org/viewtopic.php?f=10&t=2248
 - support scaling up to 6xBRZ
 */

-enum ColorFormat // from high bits -> low bits, 8 bit per channel
-{ RGB,           // 8 bit for each red, green, blue, upper 8 bits unused
-  ARGB,          // including alpha channel, BGRA byte order on little-endian machines
+enum class ColorFormat //from high bits -> low bits, 8 bit per channel
+{
+    RGB,  //8 bit for each red, green, blue, upper 8 bits unused
+    ARGB, //including alpha channel, BGRA byte order on little-endian machines
+    ARGB_UNBUFFERED, //like ARGB, but without the one-time buffer creation overhead (ca. 100 - 300 ms) at the expense of a slightly slower scaling time
 };

+const int SCALE_FACTOR_MAX = 6;
+
 /*
-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally
-processing a half-open slice of rows [yFirst, yLast) only
+-> map source (srcWidth * srcHeight) to target (scale * width x scale * height) image, optionally processing a half-open slice of rows [yFirst, yLast) only
 -> support for source/target pitch in bytes!
-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no
-need to run xBRZ on the complete image:
-   Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the
-additional range the xBRZ algorithm is using during analysis)
-   Caveat: If there are multiple changed slices, make sure they do not overlap after adding these
-additional rows in order to avoid a memory race condition
+-> if your emulator changes only a few image slices during each cycle (e.g. DOSBox) then there's no need to run xBRZ on the complete image:
+   Just make sure you enlarge the source image slice by 2 rows on top and 2 on bottom (this is the additional range the xBRZ algorithm is using during analysis)
+   CAVEAT: If there are multiple changed slices, make sure they do not overlap after adding these additional rows in order to avoid a memory race condition
   in the target image data if you are using multiple threads for processing each enlarged slice!

-THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst,
-yLast) ranges do not overlap!
-               - there is a minor inefficiency for the first row of a slice, so avoid processing
-single rows only; suggestion: process 8-16 rows at least
+THREAD-SAFETY: - parts of the same image may be scaled by multiple threads as long as the [yFirst, yLast) ranges do not overlap!
+               - there is a minor inefficiency for the first row of a slice, so avoid processing single rows only; suggestion: process at least 8-16 rows
 */
-void scale(size_t factor, // valid range: 2 - 6
-           const uint32_t *src, int srcWidth, int srcHeight, int srcPitch, uint32_t *trg,
-           int trgPitch, ColorFormat colFmt, const ScalerCfg &cfg = ScalerCfg(), int yFirst = 0,
-           int yLast = std::numeric_limits<int>::max()); // slice of source image
+void scale(size_t factor, //valid range: 2 - SCALE_FACTOR_MAX
+           const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight,
+           ColorFormat colFmt, int srcPitch, int trgPitch,
+           const ScalerCfg& cfg = ScalerCfg(),
+           int yFirst = 0, int yLast = std::numeric_limits<int>::max()); //slice of source image

-void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg,
-                          int trgWidth, int trgHeight);
+void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
+                   /**/  uint32_t* trg, int trgWidth, int trgHeight);

-enum SliceType {
-        NN_SCALE_SLICE_SOURCE,
-        NN_SCALE_SLICE_TARGET,
-};
-void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight,
-                          int srcPitch, // pitch in bytes!
-                          uint32_t *trg, int trgWidth, int trgHeight, int trgPitch, SliceType st,
-                          int yFirst, int yLast);
+void nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
+                          /**/  uint32_t* trg, int trgWidth, int trgHeight);

-// parameter tuning
-bool equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight,
-                    double equalColorTolerance);

-//########################### implementation ###########################
-inline void nearestNeighborScale(const uint32_t *src, int srcWidth, int srcHeight, uint32_t *trg,
-                                 int trgWidth, int trgHeight)
-{
-        nearestNeighborScale(src,
-                             srcWidth,
-                             srcHeight,
-                             srcWidth * sizeof(uint32_t),
-                             trg,
-                             trgWidth,
-                             trgHeight,
-                             trgWidth * sizeof(uint32_t),
-                             NN_SCALE_SLICE_TARGET,
-                             0,
-                             trgHeight);
-}
+//parameter tuning
+bool equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance);
 }

 #endif
--- a/src/filters/xBRZ/xbrz_config.h
+++ b/src/filters/xBRZ/xbrz_config.h
@ -0,0 +1,35 @@
+// ****************************************************************************
+// * This file is part of the xBRZ project. It is distributed under           *
+// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
+// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
+// *                                                                          *
+// * Additionally and as a special exception, the author gives permission     *
+// * to link the code of this program with the following libraries            *
+// * (or with modified versions that use the same licenses), and distribute   *
+// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
+// * You must obey the GNU General Public License in all respects for all of  *
+// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
+// * If you modify this file, you may extend this exception to your version   *
+// * of the file, but you are not obligated to do so. If you do not wish to   *
+// * do so, delete this exception statement from your version.                *
+// ****************************************************************************
+
+#ifndef XBRZ_CONFIG_HEADER_284578425345
+#define XBRZ_CONFIG_HEADER_284578425345
+
+//do NOT include any headers here! used by xBRZ_dll!!!
+
+namespace xbrz
+{
+//Tuning parameters for the xBRZ scaler. Every member has an in-class default, so a default-constructed
+//ScalerCfg is directly usable (xbrz::scale() declares "cfg = ScalerCfg()" as its default argument).
+//NOTE(review): this header is shared with xBRZ_dll (see above) - presumably member order/types are part of that interface; confirm before changing.
+struct ScalerCfg
+{
+    double luminanceWeight            = 1;   //also accepted as an explicit argument by xbrz::equalColorTest()
+    double equalColorTolerance        = 30;  //xbrz::equalColorTest() compares a color distance against this kind of threshold
+    double centerDirectionBias        = 4;   //NOTE(review): consumed by the scaler implementation, not visible here
+    double dominantDirectionThreshold = 3.6; //NOTE(review): consumed by the scaler implementation, not visible here
+    double steepDirectionThreshold    = 2.2; //NOTE(review): consumed by the scaler implementation, not visible here
+    double newTestAttribute           = 0; //unused; test new parameters
+};
+}
+
+#endif
--- a/src/filters/xBRZ/xbrz_tools.h
+++ b/src/filters/xBRZ/xbrz_tools.h
@ -0,0 +1,272 @@
+// ****************************************************************************
+// * This file is part of the xBRZ project. It is distributed under           *
+// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
+// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
+// *                                                                          *
+// * Additionally and as a special exception, the author gives permission     *
+// * to link the code of this program with the following libraries            *
+// * (or with modified versions that use the same licenses), and distribute   *
+// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
+// * You must obey the GNU General Public License in all respects for all of  *
+// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
+// * If you modify this file, you may extend this exception to your version   *
+// * of the file, but you are not obligated to do so. If you do not wish to   *
+// * do so, delete this exception statement from your version.                *
+// ****************************************************************************
+
+#ifndef XBRZ_TOOLS_H_825480175091875
+#define XBRZ_TOOLS_H_825480175091875
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+
+namespace xbrz
+{
+//Return byte number N of "val", counting from the least significant byte (N = 0).
+template <uint32_t N> inline
+unsigned char getByte(uint32_t val)
+{
+    const uint32_t shifted = val >> (8 * N);
+    return static_cast<unsigned char>(shifted & 0xff);
+}
+
+//Channel accessors for 32-bit pixels: alpha = byte 3, red = byte 2, green = byte 1, blue = byte 0.
+inline unsigned char getAlpha(uint32_t pix)
+{
+    return getByte<3>(pix);
+}
+
+inline unsigned char getRed(uint32_t pix)
+{
+    return getByte<2>(pix);
+}
+
+inline unsigned char getGreen(uint32_t pix)
+{
+    return getByte<1>(pix);
+}
+
+inline unsigned char getBlue(uint32_t pix)
+{
+    return getByte<0>(pix);
+}
+
+//Pack 8-bit channels into one 32-bit pixel: alpha -> bits 24-31, red -> bits 16-23, green -> bits 8-15, blue -> bits 0-7.
+//Each channel is widened to uint32_t before shifting: an unsigned char operand would otherwise be promoted to
+//(signed) int, and "a << 24" with a >= 0x80 would shift into the sign bit.
+inline uint32_t makePixel(unsigned char a, unsigned char r, unsigned char g, unsigned char b)
+{
+    return (static_cast<uint32_t>(a) << 24) |
+           (static_cast<uint32_t>(r) << 16) |
+           (static_cast<uint32_t>(g) <<  8) |
+           /**/static_cast<uint32_t>(b);
+}
+
+//Three-channel overload: same layout as above, the upper 8 bits stay zero.
+inline uint32_t makePixel(unsigned char r, unsigned char g, unsigned char b)
+{
+    return (static_cast<uint32_t>(r) << 16) |
+           (static_cast<uint32_t>(g) <<  8) |
+           /**/static_cast<uint32_t>(b);
+}
+
+//16-bit -> 24-bit RGB: every channel is moved to the top of its 8-bit slot; the freed low bits are left at zero.
+inline uint32_t rgb555to888(uint16_t pix)
+{
+    const uint32_t red   = (pix & 0x7C00) << 9;
+    const uint32_t green = (pix & 0x03E0) << 6;
+    const uint32_t blue  = (pix & 0x001F) << 3;
+    return red | green | blue;
+}
+
+inline uint32_t rgb565to888(uint16_t pix)
+{
+    const uint32_t red   = (pix & 0xF800) << 8;
+    const uint32_t green = (pix & 0x07E0) << 5; //6 bits of green
+    const uint32_t blue  = (pix & 0x001F) << 3;
+    return red | green | blue;
+}
+
+//24-bit -> 16-bit RGB: only the most significant bits of every channel are kept (truncation, no rounding).
+inline uint16_t rgb888to555(uint32_t pix)
+{
+    const uint32_t red   = (pix & 0xF80000) >> 9;
+    const uint32_t green = (pix & 0x00F800) >> 6;
+    const uint32_t blue  = (pix & 0x0000F8) >> 3;
+    return static_cast<uint16_t>(red | green | blue);
+}
+
+inline uint16_t rgb888to565(uint32_t pix)
+{
+    const uint32_t red   = (pix & 0xF80000) >> 8;
+    const uint32_t green = (pix & 0x00FC00) >> 5; //6 bits of green
+    const uint32_t blue  = (pix & 0x0000F8) >> 3;
+    return static_cast<uint16_t>(red | green | blue);
+}
+
+
+//Move a pixel pointer by "bytes" bytes (may be negative) rather than by whole elements;
+//used to step through images whose row pitch is given in bytes. Constness of Pix is preserved.
+template <class Pix> inline
+Pix* byteAdvance(Pix* ptr, int bytes)
+{
+    using RawPix = typename std::remove_cv<Pix>::type;
+    static_assert(std::is_integral<RawPix>::value, "Pix* is expected to be cast-able to char*");
+
+    //pick "char*" for mutable pixels and "const char*" otherwise, so no qualifier is cast away
+    using BytePtr = typename std::conditional<std::is_same<Pix, RawPix>::value, char*, const char*>::type;
+
+    BytePtr const rawPos = reinterpret_cast<BytePtr>(ptr);
+    return reinterpret_cast<Pix*>(rawPos + bytes);
+}
+
+
+//Paint a rectangle of blockWidth x blockHeight pixels with "col".
+//"trg" addresses the rectangle's first pixel; consecutive rows are "pitch" bytes apart.
+//Written as plain nested loops (not std::fill); non-positive block dimensions write nothing.
+template <class Pix> inline
+void fillBlock(Pix* trg, int pitch, Pix col, int blockWidth, int blockHeight)
+{
+    Pix* row = trg;
+    for (int rowsLeft = blockHeight; rowsLeft > 0; --rowsLeft)
+    {
+        for (int x = 0; x < blockWidth; ++x)
+            row[x] = col;
+
+        row = byteAdvance(row, pitch);
+    }
+}
+
+
+//Nearest-neighbor scaling driven by the TARGET image: every target pixel looks up its source pixel.
+//(slow for upscaling, since source pixels are read multiple times and miss out on cache; fast for similar image sizes)
+//  - srcPitch/trgPitch are row strides in bytes; each must cover at least one full row, otherwise the call asserts and returns
+//  - only target rows [yFirst, yLast), clamped to [0, trgHeight), are written
+//  - pixCvrt converts a PixSrc into a PixTrg
+template <class PixSrc, class PixTrg, class PixConverter>
+void nearestNeighborScale(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
+                          /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
+                          int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
+{
+    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
+    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
+
+    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
+
+    const int srcRowBytes = srcWidth * static_cast<int>(sizeof(PixSrc));
+    const int trgRowBytes = trgWidth * static_cast<int>(sizeof(PixTrg));
+    const bool pitchesValid = srcPitch >= srcRowBytes && trgPitch >= trgRowBytes;
+    if (!pitchesValid)
+    {
+        assert(false);
+        return;
+    }
+
+    const int rowBegin = std::max(yFirst, 0);
+    const int rowEnd   = std::min(yLast, trgHeight);
+    if (rowBegin >= rowEnd) return; //empty slice (also covers trgHeight <= 0)
+    if (srcHeight <= 0 || srcWidth <= 0) return; //nothing to read from
+
+    for (int yTrg = rowBegin; yTrg < rowEnd; ++yTrg)
+    {
+        const int ySrc = srcHeight * yTrg / trgHeight; //floor() mapping target row -> source row
+
+        const PixSrc* const srcLine = byteAdvance(src, ySrc * srcPitch);
+        PixTrg*       const trgLine = byteAdvance(trg, yTrg * trgPitch);
+
+        for (int xTrg = 0; xTrg < trgWidth; ++xTrg)
+            trgLine[xTrg] = pixCvrt(srcLine[srcWidth * xTrg / trgWidth]);
+    }
+}
+
+
+//Nearest-neighbor scaling driven by the SOURCE image: every source pixel is read once and painted as a block
+//into the target (fast for upscaling, since the source is read only once).
+//  - srcPitch/trgPitch are row strides in bytes; each must cover at least one full row, otherwise the call asserts and returns
+//  - [yFirst, yLast) selects SOURCE rows here and is clamped to [0, srcHeight)
+//  - pixCvrt converts a PixSrc into a PixTrg
+template <class PixSrc, class PixTrg, class PixConverter>
+void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch,
+                                    /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
+                                    int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
+{
+    static_assert(std::is_integral<PixSrc>::value, "PixSrc* is expected to be cast-able to char*");
+    static_assert(std::is_integral<PixTrg>::value, "PixTrg* is expected to be cast-able to char*");
+
+    static_assert(std::is_same<decltype(pixCvrt(PixSrc())), PixTrg>::value, "PixConverter returning wrong pixel format");
+
+    const int srcRowBytes = srcWidth * static_cast<int>(sizeof(PixSrc));
+    const int trgRowBytes = trgWidth * static_cast<int>(sizeof(PixTrg));
+    const bool pitchesValid = srcPitch >= srcRowBytes && trgPitch >= trgRowBytes;
+    if (!pitchesValid)
+    {
+        assert(false);
+        return;
+    }
+
+    const int rowBegin = std::max(yFirst, 0);
+    const int rowEnd   = std::min(yLast, srcHeight);
+    if (rowBegin >= rowEnd) return; //empty slice (also covers srcHeight <= 0)
+    if (trgWidth <= 0 || trgHeight <= 0) return; //nothing to write to
+
+    for (int ySrc = rowBegin; ySrc < rowEnd; ++ySrc)
+    {
+        //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
+        // => the target rows of this source row are the integers in [ySrc, ySrc + 1) * trgHeight / srcHeight
+        //computed per row (no state carried between iterations) to support MT input slices!
+        const int yTrgFirst = ( ySrc      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(ySrc * trgHeight / srcHeight)
+        const int yTrgLast  = ((ySrc + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil((ySrc + 1) * trgHeight / srcHeight)
+        const int blockHeight = yTrgLast - yTrgFirst;
+        if (blockHeight <= 0)
+            continue; //downscaling: this source row maps to no target row
+
+        const PixSrc* const srcLine = byteAdvance(src, ySrc      * srcPitch);
+        /**/  PixTrg*       trgLine = byteAdvance(trg, yTrgFirst * trgPitch);
+        int xTrgFirst = 0;
+
+        for (int xSrc = 0; xSrc < srcWidth; ++xSrc)
+        {
+            const int xTrgLast = ((xSrc + 1) * trgWidth + srcWidth - 1) / srcWidth;
+            const int blockWidth = xTrgLast - xTrgFirst;
+            if (blockWidth <= 0)
+                continue; //this source column maps to no target column
+
+            xTrgFirst = xTrgLast;
+
+            const auto trgPix = pixCvrt(srcLine[xSrc]);
+            fillBlock(trgLine, trgPitch, trgPix, blockWidth, blockHeight);
+            trgLine += blockWidth;
+        }
+    }
+}
+
+
+//Bilinear scaling of a 32-bit source image (four 8-bit channels per pixel, each interpolated separately)
+//into a target image of arbitrary integral pixel type.
+//  - srcPitch/trgPitch are row strides in bytes; each must cover at least one full row, otherwise the call asserts and returns
+//  - only target rows [yFirst, yLast), clamped to [0, trgHeight), are written
+//  - pixCvrt converts every interpolated uint32_t pixel into a PixTrg
+//Nothing is written when source or target is empty. A non-positive trgWidth is rejected up front: sizing the
+//coefficient buffer with a negative width would convert it to a huge unsigned count and make std::vector throw.
+template <class PixTrg, class PixConverter>
+void bilinearScale(const uint32_t* src, int srcWidth, int srcHeight, int srcPitch,
+                   /**/    PixTrg* trg, int trgWidth, int trgHeight, int trgPitch,
+                   int yFirst, int yLast, PixConverter pixCvrt /*convert uint32_t to PixTrg*/)
+{
+    static_assert(std::is_integral<PixTrg>::value,                            "PixTrg* is expected to be cast-able to char*");
+    static_assert(std::is_same<decltype(pixCvrt(uint32_t())), PixTrg>::value, "PixConverter returning wrong pixel format");
+
+    if (srcPitch < srcWidth * static_cast<int>(sizeof(uint32_t)) ||
+        trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
+    {
+        assert(false);
+        return;
+    }
+
+    yFirst = std::max(yFirst, 0);
+    yLast  = std::min(yLast, trgHeight);
+    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0 || trgWidth <= 0) return;
+
+    const double scaleX = static_cast<double>(trgWidth ) / srcWidth;
+    const double scaleY = static_cast<double>(trgHeight) / srcHeight;
+
+    //perf notes:
+    //    -> double-based calculation is (slightly) faster than float
+    //    -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
+    struct CoeffsX
+    {
+        int     x1 = 0; //left  source column
+        int     x2 = 0; //right source column (clamped to the last column)
+        double xx1 = 0; //weight of column x2: horizontal distance from x1
+        double x2x = 0; //weight of column x1: 1 - xx1
+    };
+    //the horizontal coefficients are identical for every row => compute them once per target column
+    std::vector<CoeffsX> buf(trgWidth);
+    for (int x = 0; x < trgWidth; ++x)
+    {
+        const int x1 = srcWidth * x / trgWidth;
+        int x2 = x1 + 1;
+        if (x2 == srcWidth) --x2; //right border: reuse the last column
+
+        const double xx1 = x / scaleX - x1;
+        const double x2x = 1 - xx1;
+
+        //member-wise assignment: CoeffsX has default member initializers, so "buf[x] = { x1, x2, xx1, x2x }" needs C++14
+        CoeffsX& coeffsX = buf[x];
+        coeffsX.x1 = x1;
+        coeffsX.x2 = x2;
+        coeffsX.xx1 = xx1;
+        coeffsX.x2x = x2x;
+    }
+
+    for (int y = yFirst; y < yLast; ++y)
+    {
+        const int y1 = srcHeight * y / trgHeight;
+        int y2 = y1 + 1;
+        if (y2 == srcHeight) --y2; //bottom border: reuse the last row
+
+        const double yy1 = y / scaleY - y1;
+        const double y2y = 1 - yy1;
+
+        const uint32_t* const srcLine     = byteAdvance(src, y1 * srcPitch);
+        const uint32_t* const srcLineNext = byteAdvance(src, y2 * srcPitch);
+        PixTrg*         const trgLine     = byteAdvance(trg, y  * trgPitch);
+
+        for (int x = 0; x < trgWidth; ++x)
+        {
+            //perf: do NOT "simplify" the variable layout without measurement!
+            const int     x1 = buf[x].x1;
+            const int     x2 = buf[x].x2;
+            const double xx1 = buf[x].xx1;
+            const double x2x = buf[x].x2x;
+
+            //weights of the four neighboring source pixels
+            const double x2xy2y = x2x * y2y;
+            const double xx1y2y = xx1 * y2y;
+            const double x2xyy1 = x2x * yy1;
+            const double xx1yy1 = xx1 * yy1;
+
+            //interpolate one 8-bit channel; offset = byte index within the pixel (0 = lowest byte)
+            auto interpolate = [=](int offset)
+            {
+                /* https://en.wikipedia.org/wiki/Bilinear_interpolation
+                     (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
+                     (c12(x2 - x) + c22(x - x1)) * (y  - y1)                          */
+                const auto c11 = (srcLine    [x1] >> (8 * offset)) & 0xff;
+                const auto c21 = (srcLine    [x2] >> (8 * offset)) & 0xff;
+                const auto c12 = (srcLineNext[x1] >> (8 * offset)) & 0xff;
+                const auto c22 = (srcLineNext[x2] >> (8 * offset)) & 0xff;
+
+                return c11 * x2xy2y + c21 * xx1y2y +
+                       c12 * x2xyy1 + c22 * xx1yy1;
+            };
+
+            const double bi = interpolate(0);
+            const double gi = interpolate(1);
+            const double ri = interpolate(2);
+            const double ai = interpolate(3);
+
+            //round to nearest
+            const auto b = static_cast<uint32_t>(bi + 0.5);
+            const auto g = static_cast<uint32_t>(gi + 0.5);
+            const auto r = static_cast<uint32_t>(ri + 0.5);
+            const auto a = static_cast<uint32_t>(ai + 0.5);
+
+            const uint32_t trgPix = (a << 24) | (r << 16) | (g << 8) | b;
+
+            trgLine[x] = pixCvrt(trgPix);
+        }
+    }
+}
+}
+
+#endif //XBRZ_TOOLS_H_825480175091875
--- a/src/filters/xbrzfilter.cpp
+++ b/src/filters/xbrzfilter.cpp
@ -4,25 +4,25 @@

 void xbrz2x32(uint8_t *srcPtr, uint32_t srcPitch, uint8_t * /* deltaPtr */, uint8_t *dstPtr, uint32_t dstPitch, int width, int height)
 {
-    xbrz::scale(2, (const uint32_t *)srcPtr, width, height, srcPitch, (uint32_t *)dstPtr, dstPitch, xbrz::RGB);
+    xbrz::scale(2, (const uint32_t *)srcPtr, (uint32_t *)dstPtr, width, height, xbrz::ColorFormat::RGB, srcPitch, dstPitch);
 }

 void xbrz3x32(uint8_t *srcPtr, uint32_t srcPitch, uint8_t * /* deltaPtr */, uint8_t *dstPtr, uint32_t dstPitch, int width, int height)
 {
-    xbrz::scale(3, (const uint32_t *)srcPtr, width, height, srcPitch, (uint32_t *)dstPtr, dstPitch, xbrz::RGB);
+    xbrz::scale(3, (const uint32_t *)srcPtr, (uint32_t *)dstPtr, width, height, xbrz::ColorFormat::RGB, srcPitch, dstPitch);
 }

 void xbrz4x32(uint8_t *srcPtr, uint32_t srcPitch, uint8_t * /* deltaPtr */, uint8_t *dstPtr, uint32_t dstPitch, int width, int height)
 {
-    xbrz::scale(4, (const uint32_t *)srcPtr, width, height, srcPitch, (uint32_t *)dstPtr, dstPitch, xbrz::RGB);
+    xbrz::scale(4, (const uint32_t *)srcPtr, (uint32_t *)dstPtr, width, height, xbrz::ColorFormat::RGB, srcPitch, dstPitch);
 }

 void xbrz5x32(uint8_t *srcPtr, uint32_t srcPitch, uint8_t * /* deltaPtr */, uint8_t *dstPtr, uint32_t dstPitch, int width, int height)
 {
-    xbrz::scale(5, (const uint32_t *)srcPtr, width, height, srcPitch, (uint32_t *)dstPtr, dstPitch, xbrz::RGB);
+    xbrz::scale(5, (const uint32_t *)srcPtr, (uint32_t *)dstPtr, width, height, xbrz::ColorFormat::RGB, srcPitch, dstPitch);
 }

 void xbrz6x32(uint8_t *srcPtr, uint32_t srcPitch, uint8_t * /* deltaPtr */, uint8_t *dstPtr, uint32_t dstPitch, int width, int height)
 {
-    xbrz::scale(6, (const uint32_t *)srcPtr, width, height, srcPitch, (uint32_t *)dstPtr, dstPitch, xbrz::RGB);
+    xbrz::scale(6, (const uint32_t *)srcPtr, (uint32_t *)dstPtr, width, height, xbrz::ColorFormat::RGB, srcPitch, dstPitch);
 }
--- a/src/wx/panel.cpp
+++ b/src/wx/panel.cpp
@ -1542,6 +1542,7 @@ public:
        int outrb = systemColorDepth == 24 ? 0 : 4;
        int outstride = std::ceil(width * outbpp * scale) + outrb;
        delta += instride * procy;
+
        // FIXME: fugly hack
        if(gopts.render_method == RND_OPENGL)
            dst += (int)std::ceil(outstride * (procy + 1) * scale);
@ -1554,7 +1555,7 @@ public:
                return 0;
            }

-            src += instride;
+            //src += instride;

            // interframe blending filter
            // definitely not thread safe by default
@ -1589,7 +1590,7 @@ public:
                continue;
            }

-            src += instride * procy;
+            //src += instride * procy;

            // naturally, any of these with accumulation buffers like those of
            // the IFB filters will screw up royally as well