Video Filters: Improve performance of Deposterize and XBRZ CPU-based filters. (Based on PR #631.)

- Special thanks to @m42a for the inspiration on this code!
This commit is contained in:
rogerman 2024-08-01 21:07:24 -07:00
parent be51e41c04
commit 15f5b169cc
3 changed files with 200 additions and 144 deletions

View File

@ -1,5 +1,5 @@
/*
Copyright (C) 2016-2017 DeSmuME team
Copyright (C) 2016-2024 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -21,83 +21,125 @@
#define DEPOSTERIZE_THRESHOLD 23 // Maximum per-channel difference at which two pixels are still blended. Possible values are [0-255], where a lower value allows less blending and a higher value allows more blending.
static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB)
namespace
{
const u32 aB = (pixB & 0xFF000000) >> 24;
if (aB == 0)
template <u32 DEN>
struct UnpackedPixel
{
return pixA;
u32 r;
u32 g;
u32 b;
u32 a;
u32 pack() const
{
return ( ((r/DEN) << 0) |
((g/DEN) << 8) |
((b/DEN) << 16) |
((a/DEN) << 24) );
}
const u32 rA = (pixA & 0x000000FF);
const u32 gA = (pixA & 0x0000FF00) >> 8;
const u32 bA = (pixA & 0x00FF0000) >> 16;
const u32 aA = (pixA & 0xFF000000) >> 24;
const u32 rB = (pixB & 0x000000FF);
const u32 gB = (pixB & 0x0000FF00) >> 8;
const u32 bB = (pixB & 0x00FF0000) >> 16;
const u32 rC = ( (rB - rA <= DEPOSTERIZE_THRESHOLD) || (rA - rB <= DEPOSTERIZE_THRESHOLD) ) ? ((rA+rB)>>1) : rA;
const u32 gC = ( (gB - gA <= DEPOSTERIZE_THRESHOLD) || (gA - gB <= DEPOSTERIZE_THRESHOLD) ) ? ((gA+gB)>>1) : gA;
const u32 bC = ( (bB - bA <= DEPOSTERIZE_THRESHOLD) || (bA - bB <= DEPOSTERIZE_THRESHOLD) ) ? ((bA+bB)>>1) : bA;
const u32 aC = ( (aB - aA <= DEPOSTERIZE_THRESHOLD) || (aA - aB <= DEPOSTERIZE_THRESHOLD) ) ? ((aA+aB)>>1) : aA;
return (rC | (gC << 8) | (bC << 16) | (aC << 24));
}
static u32 Deposterize_Blend(const u32 pixA, const u32 pixB, const u32 weightA, const u32 weightB)
{
const u32 aB = (pixB & 0xFF000000) >> 24;
if (aB == 0)
{
return pixA;
}
const u32 rbA = pixA & 0x00FF00FF;
const u32 gA = pixA & 0x0000FF00;
const u32 aA = (pixA & 0xFF000000) >> 24;
const u32 rbB = pixB & 0x00FF00FF;
const u32 gB = pixB & 0x0000FF00;
// Note: The sum of weightA and weightB must equal 16.
const u32 rbC = ( ((rbA * weightA) + (rbB * weightB)) / 16 ) & 0x00FF00FF;
const u32 gC = ( (( gA * weightA) + ( gB * weightB)) / 16 ) & 0x0000FF00;
const u32 aC = ( (( aA * weightA) + ( aB * weightB)) / 16 ) << 24;
return (rbC | gC | aC);
}
static u32 Deposterize_BlendPixel(const u32 color[9])
{
const u32 blend[9] = {
color[0],
Deposterize_InterpLTE(color[0], color[1]),
Deposterize_InterpLTE(color[0], color[2]),
Deposterize_InterpLTE(color[0], color[3]),
Deposterize_InterpLTE(color[0], color[4]),
Deposterize_InterpLTE(color[0], color[5]),
Deposterize_InterpLTE(color[0], color[6]),
Deposterize_InterpLTE(color[0], color[7]),
Deposterize_InterpLTE(color[0], color[8])
};
return Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 2, 14),
Deposterize_Blend(blend[0], blend[1], 2, 14),
8, 8),
Deposterize_Blend(Deposterize_Blend(blend[0], blend[7], 2, 14),
Deposterize_Blend(blend[0], blend[3], 2, 14),
8, 8),
8, 8),
Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[6], 7, 9),
Deposterize_Blend(blend[0], blend[2], 7, 9),
8, 8),
Deposterize_Blend(Deposterize_Blend(blend[0], blend[8], 7, 9),
Deposterize_Blend(blend[0], blend[4], 7, 9),
8, 8),
8, 8),
12, 4);
// Interpolates pixA toward pixB one channel at a time.
//
// The result is expressed with a denominator of 2 (each channel holds twice
// its real value), so no precision is lost to an early division:
//   - A channel whose difference (pixA - pixB) lies within
//     [-DEPOSTERIZE_THRESHOLD, +DEPOSTERIZE_THRESHOLD] becomes pixA + pixB,
//     i.e. twice the average of the two.
//   - Any other channel becomes pixA doubled, i.e. pixA unchanged.
//   - If pixB has an alpha of 0, every channel is pixA doubled.
static FORCEINLINE UnpackedPixel<2> Deposterize_InterpLTE(const UnpackedPixel<1> &pixA, const UnpackedPixel<1> &pixB)
{
	// Default result: pixA as-is, rescaled to the denominator of 2.
	UnpackedPixel<2> result = {
		pixA.r << 1,
		pixA.g << 1,
		pixA.b << 1,
		pixA.a << 1
	};

	if (pixB.a != 0)
	{
		// Signed per-channel deltas, so a single range test covers both directions.
		const s32 deltaR = pixA.r - pixB.r;
		const s32 deltaG = pixA.g - pixB.g;
		const s32 deltaB = pixA.b - pixB.b;
		const s32 deltaA = pixA.a - pixB.a;

		if ( (deltaR >= -DEPOSTERIZE_THRESHOLD) && (deltaR <= DEPOSTERIZE_THRESHOLD) )
		{
			result.r = pixA.r + pixB.r;
		}

		if ( (deltaG >= -DEPOSTERIZE_THRESHOLD) && (deltaG <= DEPOSTERIZE_THRESHOLD) )
		{
			result.g = pixA.g + pixB.g;
		}

		if ( (deltaB >= -DEPOSTERIZE_THRESHOLD) && (deltaB <= DEPOSTERIZE_THRESHOLD) )
		{
			result.b = pixA.b + pixB.b;
		}

		if ( (deltaA >= -DEPOSTERIZE_THRESHOLD) && (deltaA <= DEPOSTERIZE_THRESHOLD) )
		{
			result.a = pixA.a + pixB.a;
		}
	}

	return result;
}
// Convenience overload: unpacks the packed 32-bit color color32B into its four
// 8-bit channels (r in bits 0-7, g in bits 8-15, b in bits 16-23, a in bits
// 24-31) and forwards to the unpacked-pixel overload.
static FORCEINLINE UnpackedPixel<2> Deposterize_InterpLTE(const UnpackedPixel<1> &pixA, const u32 color32B)
{
	UnpackedPixel<1> unpackedB;
	unpackedB.r = (color32B >>  0) & 0xFF;
	unpackedB.g = (color32B >>  8) & 0xFF;
	unpackedB.b = (color32B >> 16) & 0xFF;
	unpackedB.a = (color32B >> 24) & 0xFF;

	return Deposterize_InterpLTE(pixA, unpackedB);
}
// Weighted sum of two pixels that share the denominator DEN.
//
// Each output channel is (pixA * WEIGHTA) + (pixB * WEIGHTB). Nothing is
// divided here; the scale is carried in the return type instead, whose
// denominator is DEN * (WEIGHTA + WEIGHTB).
template <u32 WEIGHTA, u32 WEIGHTB, u32 DEN>
static FORCEINLINE UnpackedPixel<DEN*(WEIGHTA+WEIGHTB)> Deposterize_Blend(const UnpackedPixel<DEN> &pixA, const UnpackedPixel<DEN> &pixB)
{
	const UnpackedPixel<DEN*(WEIGHTA+WEIGHTB)> blended = {
		(WEIGHTA * pixA.r) + (WEIGHTB * pixB.r),
		(WEIGHTA * pixA.g) + (WEIGHTB * pixB.g),
		(WEIGHTA * pixA.b) + (WEIGHTB * pixB.b),
		(WEIGHTA * pixA.a) + (WEIGHTB * pixB.a)
	};

	return blended;
}
// Computes the deposterized value of one pixel from a 9-entry neighborhood.
//
// color32[0] is the pixel being filtered; color32[1..8] are the eight pixels
// it is compared against. Every neighbor is first interpolated against the
// center with Deposterize_InterpLTE(), then mixed back with the center:
//   - entries 1, 3, 5, 7 use a  2:14 (center : interpolated) weighting
//   - entries 2, 4, 6, 8 use a  7:9  (center : interpolated) weighting
// Each group of four is averaged, and the two group averages are combined
// 3:1 in favor of the first group. All intermediate values keep their
// accumulated denominator (2 -> 32 -> 64 -> 128 -> 512), so the only division
// is the final one performed by pack().
static u32 Deposterize_BlendPixel(const u32 color32[9])
{
	// Unpack the center pixel into 8-bit channels.
	UnpackedPixel<1> center;
	center.r = (color32[0] >>  0) & 0x000000FF;
	center.g = (color32[0] >>  8) & 0x000000FF;
	center.b = (color32[0] >> 16) & 0x000000FF;
	center.a = (color32[0] >> 24) & 0x000000FF;

	// The same pixel rescaled to a denominator of 2, matching the scale of
	// the values returned by Deposterize_InterpLTE().
	UnpackedPixel<2> center2;
	center2.r = center.r << 1;
	center2.g = center.g << 1;
	center2.b = center.b << 1;
	center2.a = center.a << 1;

	// First group: entries 5, 1, 7, 3 at 2:14.
	const UnpackedPixel<32> mix5 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[5]));
	const UnpackedPixel<32> mix1 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[1]));
	const UnpackedPixel<32> mix7 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[7]));
	const UnpackedPixel<32> mix3 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[3]));

	const UnpackedPixel<64> pair51 = Deposterize_Blend<1, 1>(mix5, mix1);
	const UnpackedPixel<64> pair73 = Deposterize_Blend<1, 1>(mix7, mix3);
	const UnpackedPixel<128> firstGroup = Deposterize_Blend<1, 1>(pair51, pair73);

	// Second group: entries 6, 2, 8, 4 at 7:9.
	const UnpackedPixel<32> mix6 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[6]));
	const UnpackedPixel<32> mix2 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[2]));
	const UnpackedPixel<32> mix8 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[8]));
	const UnpackedPixel<32> mix4 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[4]));

	const UnpackedPixel<64> pair62 = Deposterize_Blend<1, 1>(mix6, mix2);
	const UnpackedPixel<64> pair84 = Deposterize_Blend<1, 1>(mix8, mix4);
	const UnpackedPixel<128> secondGroup = Deposterize_Blend<1, 1>(pair62, pair84);

	// Combine the groups 3:1 and collapse the accumulated denominator of 512.
	const UnpackedPixel<512> pixOut = Deposterize_Blend<3, 1>(firstGroup, secondGroup);
	return pixOut.pack();
}
}
void RenderDeposterize(SSurface Src, SSurface Dst)

View File

@ -16,6 +16,9 @@
// * do so, delete this exception statement from your version. *
// ****************************************************************************
// 2024-08-01 (rogerman): Small performance optimization to
// ColorDistanceARGB::dist(). (Special thanks to m42a
// for this.)
// 2016-03-04 (rogerman): Update to XBRZ 1.4.
//
// 2014-11-18 (rogerman): Update to XBRZ 1.1.
@ -1152,23 +1155,31 @@ struct ColorDistanceARGB
{
static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
{
const double a1 = getAlpha(pix1) / 255.0 ;
const double a2 = getAlpha(pix2) / 255.0 ;
/*
Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
const int a1 = getAlpha(pix1);
const int a2 = getAlpha(pix2);
1. if a1 = a2, distance should be: a1 * distYCbCr()
2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
*/
// Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
// 1. if a1 = a2, distance should be: a1 * distYCbCr()
// 2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
// 3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
if (a1 == 0)
return a2;
if (a2 == 0)
return a1;
//return std::min(a1, a2) * DistYCbCrBuffer::dist(pix1, pix2) + 255 * abs(a1 - a2);
//=> following code is 15% faster:
const double d = DistYCbCrBuffer::dist(pix1, pix2);
if (a1 == 255 && a2 == 255)
return d;
if (a1 == a2)
return a1 * d / 255.0;
if (a1 < a2)
return a1 * d + 255 * (a2 - a1);
return a1 * d / 255.0 + (a2 - a1);
else
return a2 * d + 255 * (a1 - a2);
return a2 * d / 255.0 + (a1 - a2);
//alternative? return std::sqrt(a1 * a2 * square(DistYCbCrBuffer::dist(pix1, pix2)) + square(255 * (a1 - a2)));
}

View File

@ -13,6 +13,9 @@
// * do so, delete this exception statement from your version. *
// ****************************************************************************
// 2024-08-01 (rogerman): Small performance optimization to
// ColorDistanceARGB::dist(). (Special thanks to m42a
// for this.)
// 2016-03-04 (rogerman): Update to XBRZ 1.4.
//
// 2014-11-18 (rogerman): Update to XBRZ 1.1.