Video Filters: Improve performance of Deposterize and XBRZ CPU-based filters. (Based on PR #631.)
- Special thanks to @m42a for the inspiration on this code!
This commit is contained in:
parent
be51e41c04
commit
15f5b169cc
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright (C) 2016-2017 DeSmuME team
|
||||
Copyright (C) 2016-2024 DeSmuME team
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -21,83 +21,125 @@
|
|||
#define DEPOSTERIZE_THRESHOLD 23 // Possible values are [0-255], where a lower value prevents blending and a higher value allows for more blending
|
||||
|
||||
|
||||
static u32 Deposterize_InterpLTE(const u32 pixA, const u32 pixB)
|
||||
namespace
|
||||
{
|
||||
const u32 aB = (pixB & 0xFF000000) >> 24;
|
||||
if (aB == 0)
|
||||
template <u32 DEN>
|
||||
struct UnpackedPixel
|
||||
{
|
||||
return pixA;
|
||||
u32 r;
|
||||
u32 g;
|
||||
u32 b;
|
||||
u32 a;
|
||||
|
||||
u32 pack() const
|
||||
{
|
||||
return ( ((r/DEN) << 0) |
|
||||
((g/DEN) << 8) |
|
||||
((b/DEN) << 16) |
|
||||
((a/DEN) << 24) );
|
||||
}
|
||||
|
||||
const u32 rA = (pixA & 0x000000FF);
|
||||
const u32 gA = (pixA & 0x0000FF00) >> 8;
|
||||
const u32 bA = (pixA & 0x00FF0000) >> 16;
|
||||
const u32 aA = (pixA & 0xFF000000) >> 24;
|
||||
|
||||
const u32 rB = (pixB & 0x000000FF);
|
||||
const u32 gB = (pixB & 0x0000FF00) >> 8;
|
||||
const u32 bB = (pixB & 0x00FF0000) >> 16;
|
||||
|
||||
const u32 rC = ( (rB - rA <= DEPOSTERIZE_THRESHOLD) || (rA - rB <= DEPOSTERIZE_THRESHOLD) ) ? ((rA+rB)>>1) : rA;
|
||||
const u32 gC = ( (gB - gA <= DEPOSTERIZE_THRESHOLD) || (gA - gB <= DEPOSTERIZE_THRESHOLD) ) ? ((gA+gB)>>1) : gA;
|
||||
const u32 bC = ( (bB - bA <= DEPOSTERIZE_THRESHOLD) || (bA - bB <= DEPOSTERIZE_THRESHOLD) ) ? ((bA+bB)>>1) : bA;
|
||||
const u32 aC = ( (aB - aA <= DEPOSTERIZE_THRESHOLD) || (aA - aB <= DEPOSTERIZE_THRESHOLD) ) ? ((aA+aB)>>1) : aA;
|
||||
|
||||
return (rC | (gC << 8) | (bC << 16) | (aC << 24));
|
||||
}
|
||||
|
||||
static u32 Deposterize_Blend(const u32 pixA, const u32 pixB, const u32 weightA, const u32 weightB)
|
||||
{
|
||||
const u32 aB = (pixB & 0xFF000000) >> 24;
|
||||
if (aB == 0)
|
||||
{
|
||||
return pixA;
|
||||
}
|
||||
|
||||
const u32 rbA = pixA & 0x00FF00FF;
|
||||
const u32 gA = pixA & 0x0000FF00;
|
||||
const u32 aA = (pixA & 0xFF000000) >> 24;
|
||||
|
||||
const u32 rbB = pixB & 0x00FF00FF;
|
||||
const u32 gB = pixB & 0x0000FF00;
|
||||
|
||||
// Note: The sum of weightA and weightB must equal 16.
|
||||
const u32 rbC = ( ((rbA * weightA) + (rbB * weightB)) / 16 ) & 0x00FF00FF;
|
||||
const u32 gC = ( (( gA * weightA) + ( gB * weightB)) / 16 ) & 0x0000FF00;
|
||||
const u32 aC = ( (( aA * weightA) + ( aB * weightB)) / 16 ) << 24;
|
||||
|
||||
return (rbC | gC | aC);
|
||||
}
|
||||
|
||||
static u32 Deposterize_BlendPixel(const u32 color[9])
|
||||
{
|
||||
const u32 blend[9] = {
|
||||
color[0],
|
||||
Deposterize_InterpLTE(color[0], color[1]),
|
||||
Deposterize_InterpLTE(color[0], color[2]),
|
||||
Deposterize_InterpLTE(color[0], color[3]),
|
||||
Deposterize_InterpLTE(color[0], color[4]),
|
||||
Deposterize_InterpLTE(color[0], color[5]),
|
||||
Deposterize_InterpLTE(color[0], color[6]),
|
||||
Deposterize_InterpLTE(color[0], color[7]),
|
||||
Deposterize_InterpLTE(color[0], color[8])
|
||||
};
|
||||
|
||||
return Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[5], 2, 14),
|
||||
Deposterize_Blend(blend[0], blend[1], 2, 14),
|
||||
8, 8),
|
||||
Deposterize_Blend(Deposterize_Blend(blend[0], blend[7], 2, 14),
|
||||
Deposterize_Blend(blend[0], blend[3], 2, 14),
|
||||
8, 8),
|
||||
8, 8),
|
||||
Deposterize_Blend(Deposterize_Blend(Deposterize_Blend(blend[0], blend[6], 7, 9),
|
||||
Deposterize_Blend(blend[0], blend[2], 7, 9),
|
||||
8, 8),
|
||||
Deposterize_Blend(Deposterize_Blend(blend[0], blend[8], 7, 9),
|
||||
Deposterize_Blend(blend[0], blend[4], 7, 9),
|
||||
8, 8),
|
||||
8, 8),
|
||||
12, 4);
|
||||
// Per-channel "interpolate if within threshold" step of the deposterize filter.
//
// The result carries an implied denominator of 2 (hence UnpackedPixel<2>): every
// output channel holds twice the value it represents, so no precision is lost to
// an early halving.
//   - If pixB has an alpha of 0, pixA is passed through unchanged (each channel doubled).
//   - Otherwise, a channel whose A/B difference lies within +/-DEPOSTERIZE_THRESHOLD
//     becomes A+B (twice the average of the two); any other channel stays at A (doubled).
static FORCEINLINE UnpackedPixel<2> Deposterize_InterpLTE(const UnpackedPixel<1> &pixA, const UnpackedPixel<1> &pixB)
{
	UnpackedPixel<2> result;
	
	// A pixB with zero alpha contributes nothing to the result.
	if (pixB.a == 0)
	{
		result.r = pixA.r << 1;
		result.g = pixA.g << 1;
		result.b = pixA.b << 1;
		result.a = pixA.a << 1;
		return result;
	}
	
	// Signed differences, so the threshold test can be applied in both directions.
	const s32 deltaR = pixA.r - pixB.r;
	const s32 deltaG = pixA.g - pixB.g;
	const s32 deltaB = pixA.b - pixB.b;
	const s32 deltaA = pixA.a - pixB.a;
	
	const bool isNearR = (deltaR >= -DEPOSTERIZE_THRESHOLD) && (deltaR <= DEPOSTERIZE_THRESHOLD);
	const bool isNearG = (deltaG >= -DEPOSTERIZE_THRESHOLD) && (deltaG <= DEPOSTERIZE_THRESHOLD);
	const bool isNearB = (deltaB >= -DEPOSTERIZE_THRESHOLD) && (deltaB <= DEPOSTERIZE_THRESHOLD);
	const bool isNearA = (deltaA >= -DEPOSTERIZE_THRESHOLD) && (deltaA <= DEPOSTERIZE_THRESHOLD);
	
	result.r = (isNearR) ? (pixA.r + pixB.r) : (pixA.r << 1);
	result.g = (isNearG) ? (pixA.g + pixB.g) : (pixA.g << 1);
	result.b = (isNearB) ? (pixA.b + pixB.b) : (pixA.b << 1);
	result.a = (isNearA) ? (pixA.a + pixB.a) : (pixA.a << 1);
	
	return result;
}
|
||||
|
||||
// Convenience overload: pixB is supplied as a packed 32-bit color. Its four bytes
// are split out (R in bits 0-7, G in bits 8-15, B in bits 16-23, A in bits 24-31)
// and the work is forwarded to the unpacked-pixel overload.
static FORCEINLINE UnpackedPixel<2> Deposterize_InterpLTE(const UnpackedPixel<1> &pixA, const u32 color32B)
{
	UnpackedPixel<1> unpackedB;
	unpackedB.r = (color32B >>  0) & 0x000000FF;
	unpackedB.g = (color32B >>  8) & 0x000000FF;
	unpackedB.b = (color32B >> 16) & 0x000000FF;
	unpackedB.a = (color32B >> 24) & 0x000000FF;
	
	return Deposterize_InterpLTE(pixA, unpackedB);
}
|
||||
|
||||
template <u32 WEIGHTA, u32 WEIGHTB, u32 DEN>
|
||||
static FORCEINLINE UnpackedPixel<DEN*(WEIGHTA+WEIGHTB)> Deposterize_Blend(const UnpackedPixel<DEN> &pixA, const UnpackedPixel<DEN> &pixB)
|
||||
{
|
||||
UnpackedPixel<DEN*(WEIGHTA+WEIGHTB)> ret;
|
||||
ret.r = (pixA.r * WEIGHTA) + (pixB.r * WEIGHTB);
|
||||
ret.g = (pixA.g * WEIGHTA) + (pixB.g * WEIGHTB);
|
||||
ret.b = (pixA.b * WEIGHTA) + (pixB.b * WEIGHTB);
|
||||
ret.a = (pixA.a * WEIGHTA) + (pixB.a * WEIGHTB);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Computes one deposterized output pixel from nine packed 32-bit colors.
//
// color32[0] is treated as the center pixel; color32[1..8] are the pixels it is
// blended against. Each of those is first passed through Deposterize_InterpLTE()
// against the center, then mixed with the center using fixed weights:
//   - indices 1, 3, 5, 7 use a center:interpolated weighting of 2:14
//   - indices 2, 4, 6, 8 use a center:interpolated weighting of 7:9
// The four results in each group are averaged, and the two group averages are
// combined 3:1 (first group : second group). All intermediate values keep their
// accumulated denominator in the UnpackedPixel template argument, so the only
// division happens in the final pack() call.
static u32 Deposterize_BlendPixel(const u32 color32[9])
{
	UnpackedPixel<1> center;
	center.r = (color32[0] >>  0) & 0x000000FF;
	center.g = (color32[0] >>  8) & 0x000000FF;
	center.b = (color32[0] >> 16) & 0x000000FF;
	center.a = (color32[0] >> 24) & 0x000000FF;
	
	// The center pixel rescaled to a denominator of 2, matching Deposterize_InterpLTE()'s output.
	UnpackedPixel<2> center2;
	center2.r = center.r << 1;
	center2.g = center.g << 1;
	center2.b = center.b << 1;
	center2.a = center.a << 1;
	
	// First group: indices 5, 1, 7, 3, each weighted 2:14 against the center.
	const UnpackedPixel<32> mix5 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[5]));
	const UnpackedPixel<32> mix1 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[1]));
	const UnpackedPixel<32> mix7 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[7]));
	const UnpackedPixel<32> mix3 = Deposterize_Blend<2, 14>(center2, Deposterize_InterpLTE(center, color32[3]));
	
	const UnpackedPixel<64> mix51 = Deposterize_Blend<1, 1>(mix5, mix1);
	const UnpackedPixel<64> mix73 = Deposterize_Blend<1, 1>(mix7, mix3);
	const UnpackedPixel<128> firstGroup = Deposterize_Blend<1, 1>(mix51, mix73);
	
	// Second group: indices 6, 2, 8, 4, each weighted 7:9 against the center.
	const UnpackedPixel<32> mix6 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[6]));
	const UnpackedPixel<32> mix2 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[2]));
	const UnpackedPixel<32> mix8 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[8]));
	const UnpackedPixel<32> mix4 = Deposterize_Blend<7, 9>(center2, Deposterize_InterpLTE(center, color32[4]));
	
	const UnpackedPixel<64> mix62 = Deposterize_Blend<1, 1>(mix6, mix2);
	const UnpackedPixel<64> mix84 = Deposterize_Blend<1, 1>(mix8, mix4);
	const UnpackedPixel<128> secondGroup = Deposterize_Blend<1, 1>(mix62, mix84);
	
	// Final 3:1 combination; pack() divides each channel by the accumulated denominator.
	const UnpackedPixel<512> pixOut = Deposterize_Blend<3, 1>(firstGroup, secondGroup);
	
	return pixOut.pack();
}
|
||||
}
|
||||
|
||||
void RenderDeposterize(SSurface Src, SSurface Dst)
|
||||
|
|
|
@ -16,6 +16,9 @@
|
|||
// * do so, delete this exception statement from your version. *
|
||||
// ****************************************************************************
|
||||
|
||||
// 2024-08-01 (rogerman): Small performance optimization to
|
||||
// ColorDistanceARGB::dist(). (Special thanks to m42a
|
||||
// for this.)
|
||||
// 2016-03-04 (rogerman): Update to XBRZ 1.4.
|
||||
//
|
||||
// 2014-11-18 (rogerman): Update to XBRZ 1.1.
|
||||
|
@ -1152,23 +1155,31 @@ struct ColorDistanceARGB
|
|||
{
|
||||
static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
|
||||
{
|
||||
const double a1 = getAlpha(pix1) / 255.0 ;
|
||||
const double a2 = getAlpha(pix2) / 255.0 ;
|
||||
/*
|
||||
Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
|
||||
const int a1 = getAlpha(pix1);
|
||||
const int a2 = getAlpha(pix2);
|
||||
|
||||
1. if a1 = a2, distance should be: a1 * distYCbCr()
|
||||
2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
|
||||
3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
|
||||
*/
|
||||
// Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
|
||||
|
||||
// 1. if a1 = a2, distance should be: a1 * distYCbCr()
|
||||
// 2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
|
||||
// 3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
|
||||
|
||||
if (a1 == 0)
|
||||
return a2;
|
||||
if (a2 == 0)
|
||||
return a1;
|
||||
|
||||
//return std::min(a1, a2) * DistYCbCrBuffer::dist(pix1, pix2) + 255 * abs(a1 - a2);
|
||||
//=> following code is 15% faster:
|
||||
const double d = DistYCbCrBuffer::dist(pix1, pix2);
|
||||
if (a1 == 255 && a2 == 255)
|
||||
return d;
|
||||
if (a1 == a2)
|
||||
return a1 * d / 255.0;
|
||||
if (a1 < a2)
|
||||
return a1 * d + 255 * (a2 - a1);
|
||||
return a1 * d / 255.0 + (a2 - a1);
|
||||
else
|
||||
return a2 * d + 255 * (a1 - a2);
|
||||
return a2 * d / 255.0 + (a1 - a2);
|
||||
|
||||
//alternative? return std::sqrt(a1 * a2 * square(DistYCbCrBuffer::dist(pix1, pix2)) + square(255 * (a1 - a2)));
|
||||
}
|
||||
|
|
|
@ -13,6 +13,9 @@
|
|||
// * do so, delete this exception statement from your version. *
|
||||
// ****************************************************************************
|
||||
|
||||
// 2024-08-01 (rogerman): Small performance optimization to
|
||||
// ColorDistanceARGB::dist(). (Special thanks to m42a
|
||||
// for this.)
|
||||
// 2016-03-04 (rogerman): Update to XBRZ 1.4.
|
||||
//
|
||||
// 2014-11-18 (rogerman): Update to XBRZ 1.1.
|
||||
|
|
Loading…
Reference in New Issue