2022-11-20 14:21:20 +00:00
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
// [CAS] FIDELITY FX - CONTRAST ADAPTIVE SHARPENING 1.20190610
//
//==============================================================================================================================
// LICENSE
// =======
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
// -------
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// -------
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
// -------
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//------------------------------------------------------------------------------------------------------------------------------
// ABOUT
// =====
// CAS is a spatial only filter.
// CAS takes RGB color input.
// CAS enhances sharpness and local high-frequency contrast, with or without added upsampling.
// CAS outputs RGB color.
//------------------------------------------------------------------------------------------------------------------------------
// SUGGESTIONS FOR INTEGRATION
// ===========================
// Best for performance, run CAS in sharpen-only mode, choose a video mode to have scan-out or the display scale.
// - Sharpen-only mode is faster, and provides a better quality sharpening.
// The scaling support in CAS was designed for when the application wants to do Dynamic Resolution Scaling (DRS).
// - With DRS, the render resolution can change per frame.
// - Use CAS to sharpen and upsample to the fixed output resolution, then composite the full resolution UI over CAS output.
// - This can all happen in one compute dispatch.
// It is likely better to reduce the amount of film grain which happens before CAS (as CAS will amplify grain).
// - An alternative would be to add grain after CAS.
// It is best to run CAS after tonemapping.
// - CAS needs to have input value 1.0 at the peak of the display output.
// It is ok to run CAS after compositing UI (it won't harm the UI).
//------------------------------------------------------------------------------------------------------------------------------
// EXECUTION
// =========
// CAS runs as a compute shader.
// CAS is designed to be run either in a 32-bit, CasFilter(), or packed 16-bit, CasFilterH(), form.
// The 32-bit form works on 8x8 tiles via one {64,1,1} workgroup.
// The 16-bit form works on a pair of 8x8 tiles in a 16x8 configuration via one {64,1,1} workgroup.
// CAS is designed to work best in semi-persistent form if running not async with graphics.
// For 32-bit this means looping across a collection of 4 8x8 tiles in a 2x2 tile foot-print.
// For 16-bit this means looping 2 times, once for the top 16x8 region and once for the bottom 16x8 region.
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR CPU
// ===========================
// // Make sure <stdint.h> has already been included.
// // Setup pre-portability-header defines.
// #define A_CPU 1
// // Include the portability header (requires version 1.20190530 or later which is backwards compatible).
// #include "ffx_a.h"
// // Include the CAS header.
// #include "ffx_cas.h"
// ...
// // Call the setup function to build out the constants for the shader, pass these to the shader.
// // The 'varAU4(const0);' expands into 'uint32_t const0[4];' on the CPU.
// varAU4(const0);
// varAU4(const1);
// CasSetup(const0,const1,
// 0.0f, // Sharpness tuning knob (0.0 to 1.0).
// 1920.0f,1080.0f, // Example input size.
// 2560.0f,1440.0f); // Example output size.
// ...
// // Later dispatch the shader based on the amount of semi-persistent loop unrolling.
// // Here is an example for running with the 16x16 (4-way unroll for 32-bit or 2-way unroll for 16-bit)
// vkCmdDispatch(cmdBuf,(widthInPixels+15)>>4,(heightInPixels+15)>>4,1);
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR GPU
// ===========================
// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
// layout(set=0,binding=0,rgba16f)uniform image2D imgSrc;
// layout(set=0,binding=1,rgba16f)uniform image2D imgDst;
// ...
// // Setup pre-portability-header defines (sets up GLSL/HLSL path, packed math support, etc)
// #define A_GPU 1
// #define A_GLSL 1
// #define A_HALF 1
// ...
// // Include the portability header (or copy it in without an include).
// #include "ffx_a.h"
// ...
// // Define the fetch function(s).
// // CasLoad() takes a 32-bit unsigned integer 2D coordinate and loads color.
// AF3 CasLoad(ASU2 p){return imageLoad(imgSrc,p).rgb;}
// // CasLoadH() is the 16-bit version taking 16-bit unsigned integer 2D coordinate and loading 16-bit float color.
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
2022-11-20 23:04:05 +00:00
// // If you define CAS_TEXTURE and/or CAS_TEXTUREH to a type, a value of that type will be added as the first input to CasFilter and forwarded to CasLoad
// // This is useful for forwarding extra data to the load functions, and is required by MSL, which doesn't use global textures
2022-11-20 14:21:20 +00:00
// ...
// // Define the input modifiers as nop's initially.
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){}
// ...
// // Include this CAS header file (or copy it in without an include).
// #include "ffx_cas.h"
// ...
// // Example in shader integration for loop-unrolled 16x16 case for 32-bit.
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AF4 c;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.y+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x-=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);}
// ...
// // Example for semi-persistent 16x16 but this time for packed math.
// // Use this before including 'ffx_cas.h' if not using the non-packed filter function.
// #define CAS_PACKED_ONLY 1
// ...
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AH4 c0,c1;AH2 cR,cG,cB;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// // Extra work integrated after CAS would go here.
// ...
// // Suggest only running CasDepack() right before stores, to maintain packed math for any work after CasFilterH().
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));
// gxy.y+=8u;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// ...
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));}
//------------------------------------------------------------------------------------------------------------------------------
// CAS FILTERING LOGIC
// ===================
// CAS uses the minimal nearest 3x3 source texel window for filtering.
// The filter coefficients are radially symmetric (phase adaptive, computed per pixel based on output pixel center).
// The filter kernel adapts to local contrast (adjusting the negative lobe strength of the filter kernel).
//------------------------------------------------------------------------------------------------------------------------------
// CAS INPUT REQUIREMENTS
// ======================
// This is designed to be a linear filter.
// Running CAS on perceptual inputs will yield over-sharpening.
// Input must range between {0 to 1} for each color channel.
// CAS output will be {0 to 1} ranged as well.
// CAS does 5 loads, so any conversion applied during CasLoad() or CasInput() has a 5 load * 3 channel = 15x cost amplifier.
// - So input conversions need to be factored into the prior pass's output.
// - But if necessary use CasInput() instead of CasLoad(), as CasInput() works with packed color.
// - For CAS with scaling the amplifier is 12 load * 3 channel = 36x cost amplifier.
// Any conversion applied to output has a 3x cost amplifier (3 color channels).
// - Output conversions are substantially less expensive.
// Added VALU ops due to conversions will have visible cost as this shader is already quite VALU heavy.
// This filter does not function well on sRGB or gamma 2.2 non-linear data.
// This filter does not function on PQ non-linear data.
// - Due to the shape of PQ, the positive side of the ring created by the negative lobe tends to become over-bright.
//------------------------------------------------------------------------------------------------------------------------------
// INPUT FORMAT SPECIFIC CASES
// ===========================
// - FP16 with all non-negative values ranging {0 to 1}.
// - Use as is, filter is designed for linear input and output ranging {0 to 1}.
// ---------------------------
// - UNORM with linear conversion approximation.
// - This could be used for both sRGB or FreeSync2 native (gamma 2.2) cases.
// - Load/store with either 10:10:10:2 UNORM or 8:8:8:8 UNORM (aka VK_FORMAT_R8G8B8A8_UNORM).
// - Use gamma 2.0 conversion in CasInput(), as an approximation.
// - Modifications:
// // Change the CasInput*() function to square the inputs.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){r*=r;g*=g;b*=b;}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){r*=r;g*=g;b*=b;}
// ...
// // Do linear to gamma 2.0 before store.
// // Since it will be common to do processing after CAS, the filter function returns linear.
// c.r=sqrt(c.r);c.g=sqrt(c.g);c.b=sqrt(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=sqrt(cR);cG=sqrt(cG);cB=sqrt(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - sRGB with slightly better quality and higher cost.
// - Use texelFetch() with sRGB format (VK_FORMAT_R8G8B8A8_SRGB) for loads (gets linear into shader).
// - Store to destination using UNORM (not sRGB) stores and do the linear to sRGB conversion in the shader.
// - Modifications:
// // Use texel fetch instead of image load (on GCN this will translate into an image load in the driver).
// // Hardware has sRGB to linear on loads (but in API only for read-only, aka texture instead of UAV/image).
// AF3 CasLoad(ASU2 p){return texelFetch(texSrc,p,0).rgb;}
// ...
// // Do linear to sRGB before store (GPU lacking hardware conversion support for linear to sRGB on store).
// c.r=AToSrgbF1(c.r);c.g=AToSrgbF1(c.g);c.b=AToSrgbF1(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=AToSrgbH2(cR);cG=AToSrgbH2(cG);cB=AToSrgbH2(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - HDR10 output via scRGB.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - After CAS do matrix multiply to take Rec.2020 back to sRGB and multiply by 'maxNits/80.0'.
// - Showing GPU code below to generate constants, likely most need to use CPU code instead.
// - Keeping the GPU code here because it is easier to read in these docs.
// - Can use 'lpm.h' source to generate the conversion matrix for Rec.2020 to sRGB:
// // Output conversion matrix from Rec.2020 to sRGB.
// AF3 conR,conG,conB;
// // Working space temporaries (Rec.2020).
// AF3 rgbToXyzXW;AF3 rgbToXyzYW;AF3 rgbToXyzZW;
// LpmColRgbToXyz(rgbToXyzXW,rgbToXyzYW,rgbToXyzZW,lpmCol2020R,lpmCol2020G,lpmCol2020B,lpmColD65);
// // Output space temporaries (Rec.709, same as sRGB primaries).
// AF3 rgbToXyzXO;AF3 rgbToXyzYO;AF3 rgbToXyzZO;
// LpmColRgbToXyz(rgbToXyzXO,rgbToXyzYO,rgbToXyzZO,lpmCol709R,lpmCol709G,lpmCol709B,lpmColD65);
// AF3 xyzToRgbRO;AF3 xyzToRgbGO;AF3 xyzToRgbBO;
// LpmMatInv3x3(xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXO,rgbToXyzYO,rgbToXyzZO);
// // Generate the matrix.
// LpmMatMul3x3(conR,conG,conB,xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXW,rgbToXyzYW,rgbToXyzZW);
// - Adjust the conversion matrix for the multiply by 'maxNits/80.0'.
// // After this the constants can be stored into a constant buffer.
// AF1 conScale=maxNits*ARcpF1(80.0);
// conR*=conScale;conG*=conScale;conB*=conScale;
// - After CAS do the matrix multiply (passing the fetched constants into the shader).
// outputR=dot(AF3(colorR,colorG,colorB),conR);
// outputG=dot(AF3(colorR,colorG,colorB),conG);
// outputB=dot(AF3(colorR,colorG,colorB),conB);
// - Hopefully no developer is taking scRGB as input to CAS.
// - If that was the case, the conversion matrix from sRGB to Rec.2020 can be built changing the above code.
// - Swap the 'lpmCol709*' and 'lpmCol2020*' inputs to LpmColRgbToXyz().
// - Then scale by '80.0/maxNits' instead of 'maxNits/80.0'.
// ---------------------------
// - HDR10 output via native 10:10:10:2.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - Hopefully no developer needs to take PQ as input here, but if so can use A to convert PQ to linear:
// // Where 'k0' is a constant of 'maxNits/10000.0'.
// colorR=AFromPqF1(colorR*k0);
// colorG=AFromPqF1(colorG*k0);
// colorB=AFromPqF1(colorB*k0);
// - After CAS convert from linear to PQ.
// // Where 'k1' is a constant of '10000.0/maxNits'.
// colorR=AToPqF1(colorR*k1);
// colorG=AToPqF1(colorG*k1);
// colorB=AToPqF1(colorB*k1);
// ---------------------------
// - Example of a bad idea for CAS input design.
// - Have the pass before CAS store out in 10:10:10:2 UNORM with gamma 2.0.
// - Store the output of CAS with sRGB to linear conversion, or with a gamma 2.2 conversion for FreeSync2 native.
// - This will drop precision because the inputs had been quantized to 10-bit,
// and the output is using a different tonal transform,
// so inputs and outputs won't align for similar values.
// - It might be "ok" for 8-bit/channel CAS output, but definitely not a good idea for 10-bit/channel output.
//------------------------------------------------------------------------------------------------------------------------------
// ALGORITHM DESCRIPTION
// =====================
// This describes the algorithm with CAS_BETTER_DIAGONALS defined.
// The default is with CAS_BETTER_DIAGONALS not defined (which is faster).
// Starting with no scaling.
// CAS fetches a 3x3 neighborhood around the pixel 'e',
// a b c
// d(e)f
// g h i
// It then computes a 'soft' minimum and maximum,
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// The minimum and maximums give an idea of local contrast.
// --- 1.0 ^
// | | <-- This minimum distance to the signal limit is divided by MAX to get a base sharpening amount 'A'.
// --- MAX v
// |
// |
// --- MIN ^
// | | <-- The MIN side is more distant in this example so it is not used, but for dark colors it would be used.
// | |
// --- 0.0 v
// The base sharpening amount 'A' from above is shaped with a sqrt().
// This 'A' ranges from 0 := no sharpening, to 1 := full sharpening.
// Then 'A' is scaled by the sharpness knob while being transformed to a negative lobe (values from -1/5 to -1/8 for A=1).
// The final filter kernel looks like this,
// 0 A 0
// A 1 A <-- Center is always 1.0, followed by the negative lobe 'A' in a ring, and windowed into a circle with the 0.0s.
// 0 A 0
// The local neighborhood is then multiplied by the kernel weights, summed and divided by the sum of the kernel weights.
// The high quality path computes filter weights per channel.
// The low quality path uses the green channel's filter weights to compute the 'A' factor for all channels.
// ---------------------
// The scaling path is a little more complex.
// It starts by fetching the 4x4 neighborhood around the pixel centered between centers of pixels {f,g,j,k},
// a b c d
// e(f g)h
// i(j k)l
// m n o p
// The algorithm then computes the no-scaling result for {f,g,j,k}.
// It then interpolates between those no-scaling results.
// The interpolation is adaptive.
// To hide bilinear interpolation and restore diagonals, it weights bilinear weights by 1/(const+contrast).
// Where 'contrast' is the soft 'max-min'.
// This makes edges thin out a little.
// ---------------------
// Without CAS_BETTER_DIAGONALS defined, the algorithm is a little faster.
// Instead of using the 3x3 "box" with the 5-tap "circle" this uses just the "circle".
// Drops to 5 texture fetches for no-scaling.
// Drops to 12 texture fetches for scaling.
// Drops a bunch of math.
//------------------------------------------------------------------------------------------------------------------------------
// IDEAS FOR FUTURE
// ================
// - Avoid V_CVT's by using denormals.
// - Manually pack FP16 literals.
//------------------------------------------------------------------------------------------------------------------------------
// CHANGE LOG
// ==========
// 20190610 - Misc documentation cleanup.
// 20190609 - Removed lowQuality bool, improved scaling logic.
// 20190530 - Unified CPU/GPU setup code, using new ffx_a.h, faster, define CAS_BETTER_DIAGONALS to get older slower one.
// 20190529 - Missing a good way to re-interpret packed in HLSL, so disabling approximation optimizations for now.
// 20190528 - Fixed so GPU CasSetup() generates half data all the time.
// 20190527 - Implemented approximations for rcp() and sqrt().
// 20190524 - New algorithm, adjustable sharpness, scaling to 4x area. Fixed checker debug for no-scaling only.
// 20190521 - Updated file naming.
// 20190516 - Updated docs, fixed workaround, fixed no-scaling quality issue, removed gamma2 and generalized as CasInput*().
// 20190510 - Made the dispatch example safely round up for images that are not a multiple of 16x16.
// 20190507 - Fixed typo bug in CAS_DEBUG_CHECKER, fixed sign typo in the docs.
// 20190503 - Setup temporary workaround for compiler bug.
// 20190502 - Added argument for 'gamma2' path so input transform in that case runs packed.
// 20190426 - Improved documentation on format specific cases, etc.
// 20190425 - Updated/corrected documentation.
// 20190405 - Added CAS_PACKED_ONLY, misc bug fixes.
// 20190404 - Updated for the new a.h header.
//==============================================================================================================================
// Maximum ratio of output area to input area that CasSupportScaling() accepts as a supported scaling configuration.
// This is the practical limit for the algorithm's scaling ability (quality is limited by 3x3 taps). Example resolutions,
// 1280x720 -> 1080p = 2.25x area
// 1536x864 -> 1080p = 1.56x area
// 1792x1008 -> 1440p = 2.04x area
// 1920x1080 -> 1440p = 1.78x area
// 1920x1080 -> 4K = 4.0x area
// 2048x1152 -> 1440p = 1.56x area
// 2560x1440 -> 4K = 2.25x area
// 3072x1728 -> 4K = 1.56x area
# define CAS_AREA_LIMIT 4.0
//------------------------------------------------------------------------------------------------------------------------------
// Pass in output and input resolution in pixels.
// This returns true if CAS supports scaling in the given configuration.
AP1 CasSupportScaling ( AF1 outX , AF1 outY , AF1 inX , AF1 inY ) { return ( ( outX * outY ) * ARcpF1 ( inX * inY ) ) < = CAS_AREA_LIMIT ; }
//==============================================================================================================================
// Call to setup required constant values (works on CPU or GPU).
// Every slot is written as a raw 32-bit word; the shader side re-interprets the bits.
//  const0[0],const0[1] = input size / output size, in X and Y.
//  const0[2],const0[3] = 0.5 * (input size / output size) - 0.5, in X and Y.
//  const1[0]           = negative lobe weight 'sharp' as 32-bit float bits.
//  const1[1]           = {sharp,0.0} converted by AU1_AH2_AF2() for the packed 16-bit path.
//  const1[2]           = 8.0 * input size / output size, X only.
//  const1[3]           = all-zero word.
// Output sizes are used as divisors via ARcpF1() and are not checked for zero.
A_STATIC void CasSetup(
outAU4 const0,
outAU4 const1,
AF1 sharpness, // 0 := default (lower ringing), 1 := maximum (highest ringing)
AF1 inputSizeInPixelsX,
AF1 inputSizeInPixelsY,
AF1 outputSizeInPixelsX,
AF1 outputSizeInPixelsY){
 // Scaling terms.
 const0[0]=AU1_AF1(inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX));
 const0[1]=AU1_AF1(inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY));
 const0[2]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5));
 const0[3]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
 // Sharpness value.
 // The knob goes through ASatF1() and then selects the lobe denominator between 8.0 (sharpness 0) and 5.0
 // (sharpness 1), so 'sharp' is the negated reciprocal of that denominator.
 AF1 sharp=-ARcpF1(ALerpF1(8.0,5.0,ASatF1(sharpness)));
 varAF2(hSharp)=initAF2(sharp,0.0);
 const1[0]=AU1_AF1(sharp);
 const1[1]=AU1_AH2_AF2(hSharp);
 const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX));
 // Zero word. Written through the same bit-cast helper as the other slots: the bits of +0.0 are all zero, and
 // unlike the function-style cast 'AU1(0)' this also compiles when the CPU path is built as plain C.
 const1[3]=AU1_AF1(AF1_(0.0));}
2022-11-20 14:21:20 +00:00
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
# ifdef A_GPU
2022-11-20 23:04:05 +00:00
// MSL does not use global textures, so default the forwarded texture type when the includer did not pick one.
#if !defined(CAS_TEXTURE) && defined(A_MSL)
 #define CAS_TEXTURE texture2d<float>
#endif
// TEXINPUT declares the extra leading 'tex' parameter and TEXCALL forwards it at call sites.
// Both expand to nothing when CAS_TEXTURE is not defined.
#if defined(CAS_TEXTURE)
 #define TEXINPUT CAS_TEXTURE tex,
 #define TEXCALL tex,
#else
 #define TEXINPUT
 #define TEXCALL
#endif
2022-11-20 14:21:20 +00:00
#ifdef CAS_PACKED_ONLY
 // Avoid compiler error: CasFilter() is still compiled in packed-only builds and references CasLoad()/CasInput().
 A_STATIC AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
 #ifdef CAS_TEXTURE
  // With CAS_TEXTURE set, CasFilter() passes the texture as the first CasLoad() argument (via TEXCALL),
  // so a stub with the matching parameter list is required as well.
  A_STATIC AF3 CasLoad(TEXINPUT ASU2 p){return AF3(0.0,0.0,0.0);}
 #endif
 // MSL Doesn't let you inout vector elements, so use a macro
 #define CasInput(r,g,b)
#endif
//------------------------------------------------------------------------------------------------------------------------------
2022-11-20 23:04:05 +00:00
A_STATIC void CasFilter (
TEXINPUT
outAF1 pixR , // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
outAF1 pixG ,
outAF1 pixB ,
2022-11-20 14:21:20 +00:00
AU2 ip , // Integer pixel position in output.
AU4 const0 , // Constants generated by CasSetup().
AU4 const1 ,
AP1 noScaling ) { // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
# ifdef CAS_DEBUG_CHECKER
if ( ( ( ( ip . x ^ ip . y ) > > 8u ) & 1u ) = = 0u ) { AF3 pix0 = CasLoad ( ASU2 ( ip ) ) ;
pixR = pix0 . r ; pixG = pix0 . g ; pixB = pix0 . b ; CasInput ( pixR , pixG , pixB ) ; return ; }
# endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if ( noScaling ) {
// a b c
// d e f
// g h i
ASU2 sp = ASU2 ( ip ) ;
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AF3 a = CasLoad ( TEXCALL sp + ASU2 ( - 1 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 b = CasLoad ( TEXCALL sp + ASU2 ( 0 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 c = CasLoad ( TEXCALL sp + ASU2 ( 1 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 d = CasLoad ( TEXCALL sp + ASU2 ( - 1 , 0 ) ) ;
A_MAYBE_UNUSED AF3 e = CasLoad ( TEXCALL sp ) ;
A_MAYBE_UNUSED AF3 f = CasLoad ( TEXCALL sp + ASU2 ( 1 , 0 ) ) ;
A_MAYBE_UNUSED AF3 g = CasLoad ( TEXCALL sp + ASU2 ( - 1 , 1 ) ) ;
A_MAYBE_UNUSED AF3 h = CasLoad ( TEXCALL sp + ASU2 ( 0 , 1 ) ) ;
A_MAYBE_UNUSED AF3 i = CasLoad ( TEXCALL sp + ASU2 ( 1 , 1 ) ) ;
2022-11-20 14:21:20 +00:00
// Run optional input transform.
CasInput ( a . r , a . g , a . b ) ;
CasInput ( b . r , b . g , b . b ) ;
CasInput ( c . r , c . g , c . b ) ;
CasInput ( d . r , d . g , d . b ) ;
CasInput ( e . r , e . g , e . b ) ;
CasInput ( f . r , f . g , f . b ) ;
CasInput ( g . r , g . g , g . b ) ;
CasInput ( h . r , h . g , h . b ) ;
CasInput ( i . r , i . g , i . b ) ;
// Soft min and max.
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// These are 2.0x bigger (factored out the extra multiply).
AF1 mnR = AMin3F1 ( AMin3F1 ( d . r , e . r , f . r ) , b . r , h . r ) ;
AF1 mnG = AMin3F1 ( AMin3F1 ( d . g , e . g , f . g ) , b . g , h . g ) ;
AF1 mnB = AMin3F1 ( AMin3F1 ( d . b , e . b , f . b ) , b . b , h . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mnR2 = AMin3F1 ( AMin3F1 ( mnR , a . r , c . r ) , g . r , i . r ) ;
AF1 mnG2 = AMin3F1 ( AMin3F1 ( mnG , a . g , c . g ) , g . g , i . g ) ;
AF1 mnB2 = AMin3F1 ( AMin3F1 ( mnB , a . b , c . b ) , g . b , i . b ) ;
mnR = mnR + mnR2 ;
mnG = mnG + mnG2 ;
mnB = mnB + mnB2 ;
# endif
AF1 mxR = AMax3F1 ( AMax3F1 ( d . r , e . r , f . r ) , b . r , h . r ) ;
AF1 mxG = AMax3F1 ( AMax3F1 ( d . g , e . g , f . g ) , b . g , h . g ) ;
AF1 mxB = AMax3F1 ( AMax3F1 ( d . b , e . b , f . b ) , b . b , h . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mxR2 = AMax3F1 ( AMax3F1 ( mxR , a . r , c . r ) , g . r , i . r ) ;
AF1 mxG2 = AMax3F1 ( AMax3F1 ( mxG , a . g , c . g ) , g . g , i . g ) ;
AF1 mxB2 = AMax3F1 ( AMax3F1 ( mxB , a . b , c . b ) , g . b , i . b ) ;
mxR = mxR + mxR2 ;
mxG = mxG + mxG2 ;
mxB = mxB + mxB2 ;
# endif
// Smooth minimum distance to signal limit divided by smooth max.
# ifdef CAS_GO_SLOWER
AF1 rcpMR = ARcpF1 ( mxR ) ;
AF1 rcpMG = ARcpF1 ( mxG ) ;
AF1 rcpMB = ARcpF1 ( mxB ) ;
# else
AF1 rcpMR = APrxLoRcpF1 ( mxR ) ;
AF1 rcpMG = APrxLoRcpF1 ( mxG ) ;
AF1 rcpMB = APrxLoRcpF1 ( mxB ) ;
# endif
# ifdef CAS_BETTER_DIAGONALS
AF1 ampR = ASatF1 ( min ( mnR , AF1_ ( 2.0 ) - mxR ) * rcpMR ) ;
AF1 ampG = ASatF1 ( min ( mnG , AF1_ ( 2.0 ) - mxG ) * rcpMG ) ;
AF1 ampB = ASatF1 ( min ( mnB , AF1_ ( 2.0 ) - mxB ) * rcpMB ) ;
# else
AF1 ampR = ASatF1 ( min ( mnR , AF1_ ( 1.0 ) - mxR ) * rcpMR ) ;
AF1 ampG = ASatF1 ( min ( mnG , AF1_ ( 1.0 ) - mxG ) * rcpMG ) ;
AF1 ampB = ASatF1 ( min ( mnB , AF1_ ( 1.0 ) - mxB ) * rcpMB ) ;
# endif
// Shaping amount of sharpening.
# ifdef CAS_GO_SLOWER
ampR = sqrt ( ampR ) ;
ampG = sqrt ( ampG ) ;
ampB = sqrt ( ampB ) ;
# else
ampR = APrxLoSqrtF1 ( ampR ) ;
ampG = APrxLoSqrtF1 ( ampG ) ;
ampB = APrxLoSqrtF1 ( ampB ) ;
# endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AF1 peak = AF1_AU1 ( const1 . x ) ;
A_MAYBE_UNUSED AF1 wR = ampR * peak ;
A_MAYBE_UNUSED AF1 wG = ampG * peak ;
A_MAYBE_UNUSED AF1 wB = ampB * peak ;
2022-11-20 14:21:20 +00:00
// Filter.
# ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
# ifdef CAS_GO_SLOWER
AF1 rcpWeight = ARcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wG ) ;
# else
AF1 rcpWeight = APrxMedRcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wG ) ;
# endif
pixR = ASatF1 ( ( b . r * wG + d . r * wG + f . r * wG + h . r * wG + e . r ) * rcpWeight ) ;
pixG = ASatF1 ( ( b . g * wG + d . g * wG + f . g * wG + h . g * wG + e . g ) * rcpWeight ) ;
pixB = ASatF1 ( ( b . b * wG + d . b * wG + f . b * wG + h . b * wG + e . b ) * rcpWeight ) ;
# else
# ifdef CAS_GO_SLOWER
AF1 rcpWeightR = ARcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wR ) ;
AF1 rcpWeightG = ARcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wG ) ;
AF1 rcpWeightB = ARcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wB ) ;
# else
AF1 rcpWeightR = APrxMedRcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wR ) ;
AF1 rcpWeightG = APrxMedRcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wG ) ;
AF1 rcpWeightB = APrxMedRcpF1 ( AF1_ ( 1.0 ) + AF1_ ( 4.0 ) * wB ) ;
# endif
pixR = ASatF1 ( ( b . r * wR + d . r * wR + f . r * wR + h . r * wR + e . r ) * rcpWeightR ) ;
pixG = ASatF1 ( ( b . g * wG + d . g * wG + f . g * wG + h . g * wG + e . g ) * rcpWeightG ) ;
pixB = ASatF1 ( ( b . b * wB + d . b * wB + f . b * wB + h . b * wB + e . b ) * rcpWeightB ) ;
# endif
return ; }
//------------------------------------------------------------------------------------------------------------------------------
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
// a b c d
// e f g h
// i j k l
// m n o p
// Working these 4 results.
// +-----+-----+
// | | |
// | f..|..g |
// | . | . |
// +-----+-----+
// | . | . |
// | j..|..k |
// | | |
// +-----+-----+
AF2 pp = AF2 ( ip ) * AF2_AU2 ( const0 . xy ) + AF2_AU2 ( const0 . zw ) ;
AF2 fp = floor ( pp ) ;
pp - = fp ;
ASU2 sp = ASU2 ( fp ) ;
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AF3 a = CasLoad ( TEXCALL sp + ASU2 ( - 1 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 b = CasLoad ( TEXCALL sp + ASU2 ( 0 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 e = CasLoad ( TEXCALL sp + ASU2 ( - 1 , 0 ) ) ;
A_MAYBE_UNUSED AF3 f = CasLoad ( TEXCALL sp ) ;
A_MAYBE_UNUSED AF3 c = CasLoad ( TEXCALL sp + ASU2 ( 1 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 d = CasLoad ( TEXCALL sp + ASU2 ( 2 , - 1 ) ) ;
A_MAYBE_UNUSED AF3 g = CasLoad ( TEXCALL sp + ASU2 ( 1 , 0 ) ) ;
A_MAYBE_UNUSED AF3 h = CasLoad ( TEXCALL sp + ASU2 ( 2 , 0 ) ) ;
A_MAYBE_UNUSED AF3 i = CasLoad ( TEXCALL sp + ASU2 ( - 1 , 1 ) ) ;
A_MAYBE_UNUSED AF3 j = CasLoad ( TEXCALL sp + ASU2 ( 0 , 1 ) ) ;
A_MAYBE_UNUSED AF3 m = CasLoad ( TEXCALL sp + ASU2 ( - 1 , 2 ) ) ;
A_MAYBE_UNUSED AF3 n = CasLoad ( TEXCALL sp + ASU2 ( 0 , 2 ) ) ;
A_MAYBE_UNUSED AF3 k = CasLoad ( TEXCALL sp + ASU2 ( 1 , 1 ) ) ;
A_MAYBE_UNUSED AF3 l = CasLoad ( TEXCALL sp + ASU2 ( 2 , 1 ) ) ;
A_MAYBE_UNUSED AF3 o = CasLoad ( TEXCALL sp + ASU2 ( 1 , 2 ) ) ;
A_MAYBE_UNUSED AF3 p = CasLoad ( TEXCALL sp + ASU2 ( 2 , 2 ) ) ;
2022-11-20 14:21:20 +00:00
// Run optional input transform.
CasInput ( a . r , a . g , a . b ) ;
CasInput ( b . r , b . g , b . b ) ;
CasInput ( c . r , c . g , c . b ) ;
CasInput ( d . r , d . g , d . b ) ;
CasInput ( e . r , e . g , e . b ) ;
CasInput ( f . r , f . g , f . b ) ;
CasInput ( g . r , g . g , g . b ) ;
CasInput ( h . r , h . g , h . b ) ;
CasInput ( i . r , i . g , i . b ) ;
CasInput ( j . r , j . g , j . b ) ;
CasInput ( k . r , k . g , k . b ) ;
CasInput ( l . r , l . g , l . b ) ;
CasInput ( m . r , m . g , m . b ) ;
CasInput ( n . r , n . g , n . b ) ;
CasInput ( o . r , o . g , o . b ) ;
CasInput ( p . r , p . g , p . b ) ;
// Soft min and max.
// These are 2.0x bigger (factored out the extra multiply).
// a b c b
// e f g * 0.5 + e f g * 0.5 [F]
// i j k j
AF1 mnfR = AMin3F1 ( AMin3F1 ( b . r , e . r , f . r ) , g . r , j . r ) ;
AF1 mnfG = AMin3F1 ( AMin3F1 ( b . g , e . g , f . g ) , g . g , j . g ) ;
AF1 mnfB = AMin3F1 ( AMin3F1 ( b . b , e . b , f . b ) , g . b , j . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mnfR2 = AMin3F1 ( AMin3F1 ( mnfR , a . r , c . r ) , i . r , k . r ) ;
AF1 mnfG2 = AMin3F1 ( AMin3F1 ( mnfG , a . g , c . g ) , i . g , k . g ) ;
AF1 mnfB2 = AMin3F1 ( AMin3F1 ( mnfB , a . b , c . b ) , i . b , k . b ) ;
mnfR = mnfR + mnfR2 ;
mnfG = mnfG + mnfG2 ;
mnfB = mnfB + mnfB2 ;
# endif
AF1 mxfR = AMax3F1 ( AMax3F1 ( b . r , e . r , f . r ) , g . r , j . r ) ;
AF1 mxfG = AMax3F1 ( AMax3F1 ( b . g , e . g , f . g ) , g . g , j . g ) ;
AF1 mxfB = AMax3F1 ( AMax3F1 ( b . b , e . b , f . b ) , g . b , j . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mxfR2 = AMax3F1 ( AMax3F1 ( mxfR , a . r , c . r ) , i . r , k . r ) ;
AF1 mxfG2 = AMax3F1 ( AMax3F1 ( mxfG , a . g , c . g ) , i . g , k . g ) ;
AF1 mxfB2 = AMax3F1 ( AMax3F1 ( mxfB , a . b , c . b ) , i . b , k . b ) ;
mxfR = mxfR + mxfR2 ;
mxfG = mxfG + mxfG2 ;
mxfB = mxfB + mxfB2 ;
# endif
// b c d c
// f g h * 0.5 + f g h * 0.5 [G]
// j k l k
AF1 mngR = AMin3F1 ( AMin3F1 ( c . r , f . r , g . r ) , h . r , k . r ) ;
AF1 mngG = AMin3F1 ( AMin3F1 ( c . g , f . g , g . g ) , h . g , k . g ) ;
AF1 mngB = AMin3F1 ( AMin3F1 ( c . b , f . b , g . b ) , h . b , k . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mngR2 = AMin3F1 ( AMin3F1 ( mngR , b . r , d . r ) , j . r , l . r ) ;
AF1 mngG2 = AMin3F1 ( AMin3F1 ( mngG , b . g , d . g ) , j . g , l . g ) ;
AF1 mngB2 = AMin3F1 ( AMin3F1 ( mngB , b . b , d . b ) , j . b , l . b ) ;
mngR = mngR + mngR2 ;
mngG = mngG + mngG2 ;
mngB = mngB + mngB2 ;
# endif
AF1 mxgR = AMax3F1 ( AMax3F1 ( c . r , f . r , g . r ) , h . r , k . r ) ;
AF1 mxgG = AMax3F1 ( AMax3F1 ( c . g , f . g , g . g ) , h . g , k . g ) ;
AF1 mxgB = AMax3F1 ( AMax3F1 ( c . b , f . b , g . b ) , h . b , k . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mxgR2 = AMax3F1 ( AMax3F1 ( mxgR , b . r , d . r ) , j . r , l . r ) ;
AF1 mxgG2 = AMax3F1 ( AMax3F1 ( mxgG , b . g , d . g ) , j . g , l . g ) ;
AF1 mxgB2 = AMax3F1 ( AMax3F1 ( mxgB , b . b , d . b ) , j . b , l . b ) ;
mxgR = mxgR + mxgR2 ;
mxgG = mxgG + mxgG2 ;
mxgB = mxgB + mxgB2 ;
# endif
// e f g f
// i j k * 0.5 + i j k * 0.5 [J]
// m n o n
AF1 mnjR = AMin3F1 ( AMin3F1 ( f . r , i . r , j . r ) , k . r , n . r ) ;
AF1 mnjG = AMin3F1 ( AMin3F1 ( f . g , i . g , j . g ) , k . g , n . g ) ;
AF1 mnjB = AMin3F1 ( AMin3F1 ( f . b , i . b , j . b ) , k . b , n . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mnjR2 = AMin3F1 ( AMin3F1 ( mnjR , e . r , g . r ) , m . r , o . r ) ;
AF1 mnjG2 = AMin3F1 ( AMin3F1 ( mnjG , e . g , g . g ) , m . g , o . g ) ;
AF1 mnjB2 = AMin3F1 ( AMin3F1 ( mnjB , e . b , g . b ) , m . b , o . b ) ;
mnjR = mnjR + mnjR2 ;
mnjG = mnjG + mnjG2 ;
mnjB = mnjB + mnjB2 ;
# endif
AF1 mxjR = AMax3F1 ( AMax3F1 ( f . r , i . r , j . r ) , k . r , n . r ) ;
AF1 mxjG = AMax3F1 ( AMax3F1 ( f . g , i . g , j . g ) , k . g , n . g ) ;
AF1 mxjB = AMax3F1 ( AMax3F1 ( f . b , i . b , j . b ) , k . b , n . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mxjR2 = AMax3F1 ( AMax3F1 ( mxjR , e . r , g . r ) , m . r , o . r ) ;
AF1 mxjG2 = AMax3F1 ( AMax3F1 ( mxjG , e . g , g . g ) , m . g , o . g ) ;
AF1 mxjB2 = AMax3F1 ( AMax3F1 ( mxjB , e . b , g . b ) , m . b , o . b ) ;
mxjR = mxjR + mxjR2 ;
mxjG = mxjG + mxjG2 ;
mxjB = mxjB + mxjB2 ;
# endif
// f g h g
// j k l * 0.5 + j k l * 0.5 [K]
// n o p o
AF1 mnkR = AMin3F1 ( AMin3F1 ( g . r , j . r , k . r ) , l . r , o . r ) ;
AF1 mnkG = AMin3F1 ( AMin3F1 ( g . g , j . g , k . g ) , l . g , o . g ) ;
AF1 mnkB = AMin3F1 ( AMin3F1 ( g . b , j . b , k . b ) , l . b , o . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mnkR2 = AMin3F1 ( AMin3F1 ( mnkR , f . r , h . r ) , n . r , p . r ) ;
AF1 mnkG2 = AMin3F1 ( AMin3F1 ( mnkG , f . g , h . g ) , n . g , p . g ) ;
AF1 mnkB2 = AMin3F1 ( AMin3F1 ( mnkB , f . b , h . b ) , n . b , p . b ) ;
mnkR = mnkR + mnkR2 ;
mnkG = mnkG + mnkG2 ;
mnkB = mnkB + mnkB2 ;
# endif
AF1 mxkR = AMax3F1 ( AMax3F1 ( g . r , j . r , k . r ) , l . r , o . r ) ;
AF1 mxkG = AMax3F1 ( AMax3F1 ( g . g , j . g , k . g ) , l . g , o . g ) ;
AF1 mxkB = AMax3F1 ( AMax3F1 ( g . b , j . b , k . b ) , l . b , o . b ) ;
# ifdef CAS_BETTER_DIAGONALS
AF1 mxkR2 = AMax3F1 ( AMax3F1 ( mxkR , f . r , h . r ) , n . r , p . r ) ;
AF1 mxkG2 = AMax3F1 ( AMax3F1 ( mxkG , f . g , h . g ) , n . g , p . g ) ;
AF1 mxkB2 = AMax3F1 ( AMax3F1 ( mxkB , f . b , h . b ) , n . b , p . b ) ;
mxkR = mxkR + mxkR2 ;
mxkG = mxkG + mxkG2 ;
mxkB = mxkB + mxkB2 ;
# endif
// Smooth minimum distance to signal limit divided by smooth max.
# ifdef CAS_GO_SLOWER
AF1 rcpMfR = ARcpF1 ( mxfR ) ;
AF1 rcpMfG = ARcpF1 ( mxfG ) ;
AF1 rcpMfB = ARcpF1 ( mxfB ) ;
AF1 rcpMgR = ARcpF1 ( mxgR ) ;
AF1 rcpMgG = ARcpF1 ( mxgG ) ;
AF1 rcpMgB = ARcpF1 ( mxgB ) ;
AF1 rcpMjR = ARcpF1 ( mxjR ) ;
AF1 rcpMjG = ARcpF1 ( mxjG ) ;
AF1 rcpMjB = ARcpF1 ( mxjB ) ;
AF1 rcpMkR = ARcpF1 ( mxkR ) ;
AF1 rcpMkG = ARcpF1 ( mxkG ) ;
AF1 rcpMkB = ARcpF1 ( mxkB ) ;
# else
AF1 rcpMfR = APrxLoRcpF1 ( mxfR ) ;
AF1 rcpMfG = APrxLoRcpF1 ( mxfG ) ;
AF1 rcpMfB = APrxLoRcpF1 ( mxfB ) ;
AF1 rcpMgR = APrxLoRcpF1 ( mxgR ) ;
AF1 rcpMgG = APrxLoRcpF1 ( mxgG ) ;
AF1 rcpMgB = APrxLoRcpF1 ( mxgB ) ;
AF1 rcpMjR = APrxLoRcpF1 ( mxjR ) ;
AF1 rcpMjG = APrxLoRcpF1 ( mxjG ) ;
AF1 rcpMjB = APrxLoRcpF1 ( mxjB ) ;
AF1 rcpMkR = APrxLoRcpF1 ( mxkR ) ;
AF1 rcpMkG = APrxLoRcpF1 ( mxkG ) ;
AF1 rcpMkB = APrxLoRcpF1 ( mxkB ) ;
# endif
# ifdef CAS_BETTER_DIAGONALS
AF1 ampfR = ASatF1 ( min ( mnfR , AF1_ ( 2.0 ) - mxfR ) * rcpMfR ) ;
AF1 ampfG = ASatF1 ( min ( mnfG , AF1_ ( 2.0 ) - mxfG ) * rcpMfG ) ;
AF1 ampfB = ASatF1 ( min ( mnfB , AF1_ ( 2.0 ) - mxfB ) * rcpMfB ) ;
AF1 ampgR = ASatF1 ( min ( mngR , AF1_ ( 2.0 ) - mxgR ) * rcpMgR ) ;
AF1 ampgG = ASatF1 ( min ( mngG , AF1_ ( 2.0 ) - mxgG ) * rcpMgG ) ;
AF1 ampgB = ASatF1 ( min ( mngB , AF1_ ( 2.0 ) - mxgB ) * rcpMgB ) ;
AF1 ampjR = ASatF1 ( min ( mnjR , AF1_ ( 2.0 ) - mxjR ) * rcpMjR ) ;
AF1 ampjG = ASatF1 ( min ( mnjG , AF1_ ( 2.0 ) - mxjG ) * rcpMjG ) ;
AF1 ampjB = ASatF1 ( min ( mnjB , AF1_ ( 2.0 ) - mxjB ) * rcpMjB ) ;
AF1 ampkR = ASatF1 ( min ( mnkR , AF1_ ( 2.0 ) - mxkR ) * rcpMkR ) ;
AF1 ampkG = ASatF1 ( min ( mnkG , AF1_ ( 2.0 ) - mxkG ) * rcpMkG ) ;
AF1 ampkB = ASatF1 ( min ( mnkB , AF1_ ( 2.0 ) - mxkB ) * rcpMkB ) ;
# else
AF1 ampfR = ASatF1 ( min ( mnfR , AF1_ ( 1.0 ) - mxfR ) * rcpMfR ) ;
AF1 ampfG = ASatF1 ( min ( mnfG , AF1_ ( 1.0 ) - mxfG ) * rcpMfG ) ;
AF1 ampfB = ASatF1 ( min ( mnfB , AF1_ ( 1.0 ) - mxfB ) * rcpMfB ) ;
AF1 ampgR = ASatF1 ( min ( mngR , AF1_ ( 1.0 ) - mxgR ) * rcpMgR ) ;
AF1 ampgG = ASatF1 ( min ( mngG , AF1_ ( 1.0 ) - mxgG ) * rcpMgG ) ;
AF1 ampgB = ASatF1 ( min ( mngB , AF1_ ( 1.0 ) - mxgB ) * rcpMgB ) ;
AF1 ampjR = ASatF1 ( min ( mnjR , AF1_ ( 1.0 ) - mxjR ) * rcpMjR ) ;
AF1 ampjG = ASatF1 ( min ( mnjG , AF1_ ( 1.0 ) - mxjG ) * rcpMjG ) ;
AF1 ampjB = ASatF1 ( min ( mnjB , AF1_ ( 1.0 ) - mxjB ) * rcpMjB ) ;
AF1 ampkR = ASatF1 ( min ( mnkR , AF1_ ( 1.0 ) - mxkR ) * rcpMkR ) ;
AF1 ampkG = ASatF1 ( min ( mnkG , AF1_ ( 1.0 ) - mxkG ) * rcpMkG ) ;
AF1 ampkB = ASatF1 ( min ( mnkB , AF1_ ( 1.0 ) - mxkB ) * rcpMkB ) ;
# endif
// Shaping amount of sharpening.
# ifdef CAS_GO_SLOWER
ampfR = sqrt ( ampfR ) ;
ampfG = sqrt ( ampfG ) ;
ampfB = sqrt ( ampfB ) ;
ampgR = sqrt ( ampgR ) ;
ampgG = sqrt ( ampgG ) ;
ampgB = sqrt ( ampgB ) ;
ampjR = sqrt ( ampjR ) ;
ampjG = sqrt ( ampjG ) ;
ampjB = sqrt ( ampjB ) ;
ampkR = sqrt ( ampkR ) ;
ampkG = sqrt ( ampkG ) ;
ampkB = sqrt ( ampkB ) ;
# else
ampfR = APrxLoSqrtF1 ( ampfR ) ;
ampfG = APrxLoSqrtF1 ( ampfG ) ;
ampfB = APrxLoSqrtF1 ( ampfB ) ;
ampgR = APrxLoSqrtF1 ( ampgR ) ;
ampgG = APrxLoSqrtF1 ( ampgG ) ;
ampgB = APrxLoSqrtF1 ( ampgB ) ;
ampjR = APrxLoSqrtF1 ( ampjR ) ;
ampjG = APrxLoSqrtF1 ( ampjG ) ;
ampjB = APrxLoSqrtF1 ( ampjB ) ;
ampkR = APrxLoSqrtF1 ( ampkR ) ;
ampkG = APrxLoSqrtF1 ( ampkG ) ;
ampkB = APrxLoSqrtF1 ( ampkB ) ;
# endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
AF1 peak = AF1_AU1 ( const1 . x ) ;
AF1 wfR = ampfR * peak ;
AF1 wfG = ampfG * peak ;
AF1 wfB = ampfB * peak ;
AF1 wgR = ampgR * peak ;
AF1 wgG = ampgG * peak ;
AF1 wgB = ampgB * peak ;
AF1 wjR = ampjR * peak ;
AF1 wjG = ampjG * peak ;
AF1 wjB = ampjB * peak ;
AF1 wkR = ampkR * peak ;
AF1 wkG = ampkG * peak ;
AF1 wkB = ampkB * peak ;
// Blend between 4 results.
// s t
// u v
AF1 s = ( AF1_ ( 1.0 ) - pp . x ) * ( AF1_ ( 1.0 ) - pp . y ) ;
AF1 t = pp . x * ( AF1_ ( 1.0 ) - pp . y ) ;
AF1 u = ( AF1_ ( 1.0 ) - pp . x ) * pp . y ;
AF1 v = pp . x * pp . y ;
// Thin edges to hide bilinear interpolation (helps diagonals).
AF1 thinB = 1.0 / 32.0 ;
# ifdef CAS_GO_SLOWER
s * = ARcpF1 ( thinB + ( mxfG - mnfG ) ) ;
t * = ARcpF1 ( thinB + ( mxgG - mngG ) ) ;
u * = ARcpF1 ( thinB + ( mxjG - mnjG ) ) ;
v * = ARcpF1 ( thinB + ( mxkG - mnkG ) ) ;
# else
s * = APrxLoRcpF1 ( thinB + ( mxfG - mnfG ) ) ;
t * = APrxLoRcpF1 ( thinB + ( mxgG - mngG ) ) ;
u * = APrxLoRcpF1 ( thinB + ( mxjG - mnjG ) ) ;
v * = APrxLoRcpF1 ( thinB + ( mxkG - mnkG ) ) ;
# endif
// Final weighting.
// b c
// e f g h
// i j k l
// n o
// _____ _____ _____ _____
// fs gt
//
// _____ _____ _____ _____
// fs s gt fs t gt
// ju kv
// _____ _____ _____ _____
// fs gt
// ju u kv ju v kv
// _____ _____ _____ _____
//
// ju kv
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AF1 qbeR = wfR * s ;
A_MAYBE_UNUSED AF1 qbeG = wfG * s ;
A_MAYBE_UNUSED AF1 qbeB = wfB * s ;
A_MAYBE_UNUSED AF1 qchR = wgR * t ;
A_MAYBE_UNUSED AF1 qchG = wgG * t ;
A_MAYBE_UNUSED AF1 qchB = wgB * t ;
A_MAYBE_UNUSED AF1 qfR = wgR * t + wjR * u + s ;
A_MAYBE_UNUSED AF1 qfG = wgG * t + wjG * u + s ;
A_MAYBE_UNUSED AF1 qfB = wgB * t + wjB * u + s ;
A_MAYBE_UNUSED AF1 qgR = wfR * s + wkR * v + t ;
A_MAYBE_UNUSED AF1 qgG = wfG * s + wkG * v + t ;
A_MAYBE_UNUSED AF1 qgB = wfB * s + wkB * v + t ;
A_MAYBE_UNUSED AF1 qjR = wfR * s + wkR * v + u ;
A_MAYBE_UNUSED AF1 qjG = wfG * s + wkG * v + u ;
A_MAYBE_UNUSED AF1 qjB = wfB * s + wkB * v + u ;
A_MAYBE_UNUSED AF1 qkR = wgR * t + wjR * u + v ;
A_MAYBE_UNUSED AF1 qkG = wgG * t + wjG * u + v ;
A_MAYBE_UNUSED AF1 qkB = wgB * t + wjB * u + v ;
A_MAYBE_UNUSED AF1 qinR = wjR * u ;
A_MAYBE_UNUSED AF1 qinG = wjG * u ;
A_MAYBE_UNUSED AF1 qinB = wjB * u ;
A_MAYBE_UNUSED AF1 qloR = wkR * v ;
A_MAYBE_UNUSED AF1 qloG = wkG * v ;
A_MAYBE_UNUSED AF1 qloB = wkB * v ;
2022-11-20 14:21:20 +00:00
// Filter.
# ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
# ifdef CAS_GO_SLOWER
AF1 rcpWG = ARcpF1 ( AF1_ ( 2.0 ) * qbeG + AF1_ ( 2.0 ) * qchG + AF1_ ( 2.0 ) * qinG + AF1_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
# else
AF1 rcpWG = APrxMedRcpF1 ( AF1_ ( 2.0 ) * qbeG + AF1_ ( 2.0 ) * qchG + AF1_ ( 2.0 ) * qinG + AF1_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
# endif
pixR = ASatF1 ( ( b . r * qbeG + e . r * qbeG + c . r * qchG + h . r * qchG + i . r * qinG + n . r * qinG + l . r * qloG + o . r * qloG + f . r * qfG + g . r * qgG + j . r * qjG + k . r * qkG ) * rcpWG ) ;
pixG = ASatF1 ( ( b . g * qbeG + e . g * qbeG + c . g * qchG + h . g * qchG + i . g * qinG + n . g * qinG + l . g * qloG + o . g * qloG + f . g * qfG + g . g * qgG + j . g * qjG + k . g * qkG ) * rcpWG ) ;
pixB = ASatF1 ( ( b . b * qbeG + e . b * qbeG + c . b * qchG + h . b * qchG + i . b * qinG + n . b * qinG + l . b * qloG + o . b * qloG + f . b * qfG + g . b * qgG + j . b * qjG + k . b * qkG ) * rcpWG ) ;
# else
# ifdef CAS_GO_SLOWER
AF1 rcpWR = ARcpF1 ( AF1_ ( 2.0 ) * qbeR + AF1_ ( 2.0 ) * qchR + AF1_ ( 2.0 ) * qinR + AF1_ ( 2.0 ) * qloR + qfR + qgR + qjR + qkR ) ;
AF1 rcpWG = ARcpF1 ( AF1_ ( 2.0 ) * qbeG + AF1_ ( 2.0 ) * qchG + AF1_ ( 2.0 ) * qinG + AF1_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
AF1 rcpWB = ARcpF1 ( AF1_ ( 2.0 ) * qbeB + AF1_ ( 2.0 ) * qchB + AF1_ ( 2.0 ) * qinB + AF1_ ( 2.0 ) * qloB + qfB + qgB + qjB + qkB ) ;
# else
AF1 rcpWR = APrxMedRcpF1 ( AF1_ ( 2.0 ) * qbeR + AF1_ ( 2.0 ) * qchR + AF1_ ( 2.0 ) * qinR + AF1_ ( 2.0 ) * qloR + qfR + qgR + qjR + qkR ) ;
AF1 rcpWG = APrxMedRcpF1 ( AF1_ ( 2.0 ) * qbeG + AF1_ ( 2.0 ) * qchG + AF1_ ( 2.0 ) * qinG + AF1_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
AF1 rcpWB = APrxMedRcpF1 ( AF1_ ( 2.0 ) * qbeB + AF1_ ( 2.0 ) * qchB + AF1_ ( 2.0 ) * qinB + AF1_ ( 2.0 ) * qloB + qfB + qgB + qjB + qkB ) ;
# endif
pixR = ASatF1 ( ( b . r * qbeR + e . r * qbeR + c . r * qchR + h . r * qchR + i . r * qinR + n . r * qinR + l . r * qloR + o . r * qloR + f . r * qfR + g . r * qgR + j . r * qjR + k . r * qkR ) * rcpWR ) ;
pixG = ASatF1 ( ( b . g * qbeG + e . g * qbeG + c . g * qchG + h . g * qchG + i . g * qinG + n . g * qinG + l . g * qloG + o . g * qloG + f . g * qfG + g . g * qgG + j . g * qjG + k . g * qkG ) * rcpWG ) ;
pixB = ASatF1 ( ( b . b * qbeB + e . b * qbeB + c . b * qchB + h . b * qchB + i . b * qinB + n . b * qinB + l . b * qloB + o . b * qloB + f . b * qfB + g . b * qgB + j . b * qjB + k . b * qkB ) * rcpWB ) ;
# endif
}
2022-11-20 23:04:05 +00:00
// Retire the texture-argument helper macros at this point; the packed (A_HALF) section
// further down defines TEXCALL/TEXINPUT again for its own functions when it is compiled.
# undef TEXINPUT
# undef TEXCALL
2022-11-20 14:21:20 +00:00
# endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================
# if defined(A_GPU) && defined(A_HALF)
2022-11-20 23:04:05 +00:00
// Texture plumbing for the packed path.
//  CAS_TEXTUREH : texture type used by the packed functions; on MSL it defaults to texture2d<half>
//                 unless the includer has already defined it.
//  TEXINPUT     : leading "CAS_TEXTUREH tex," parameter for function signatures, or nothing when
//                 no texture type is set.
//  TEXCALL      : matching leading "tex," argument for calls, or nothing when no texture type is set.
#ifndef CAS_TEXTUREH
 #ifdef A_MSL
  #define CAS_TEXTUREH texture2d<half>
 #endif
#endif
#if !defined(CAS_TEXTUREH)
 #define TEXCALL
 #define TEXINPUT
#else
 #define TEXINPUT CAS_TEXTUREH tex,
 #define TEXCALL tex,
#endif
2022-11-20 14:21:20 +00:00
// HLSL is missing a way to do packed re-interpretation, so the approximation optimizations must be
// disabled there: force CAS_GO_SLOWER on unless it has already been defined.
#if defined(A_HLSL) && !defined(CAS_GO_SLOWER)
 #define CAS_GO_SLOWER 1
#endif
//==============================================================================================================================
// Can be used to convert from packed SOA to AOS for store.
2022-11-20 23:04:05 +00:00
void CasDepack ( outAH4 pix0 , outAH4 pix1 , AH2 pixR , AH2 pixG , AH2 pixB ) {
2022-11-20 14:21:20 +00:00
# ifdef A_HLSL
// Invoke a slower path for DX only, since it won't allow uninitialized values.
pix0 . a = pix1 . a = 0.0 ;
# endif
pix0 . rgb = AH3 ( pixR . x , pixG . x , pixB . x ) ;
pix1 . rgb = AH3 ( pixR . y , pixG . y , pixB . y ) ; }
//==============================================================================================================================
void CasFilterH (
2022-11-20 23:04:05 +00:00
TEXINPUT
2022-11-20 14:21:20 +00:00
// Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = left 8x8 tile (the one at 'ip')
// pix<R,G,B>.y = right 8x8 tile (at 'ip' + (8,0) in the no-scaling path)
// This enables later processing to easily be packed as well.
2022-11-20 23:04:05 +00:00
outAH2 pixR ,
outAH2 pixG ,
outAH2 pixB ,
2022-11-20 14:21:20 +00:00
AU2 ip , // Integer pixel position in output.
AU4 const0 , // Constants generated by CasSetup().
AU4 const1 ,
AP1 noScaling ) { // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
# ifdef CAS_DEBUG_CHECKER
if ( ( ( ( ip . x ^ ip . y ) > > 8u ) & 1u ) = = 0u ) { AH3 pix0 = CasLoadH ( ASW2 ( ip ) ) ; AH3 pix1 = CasLoadH ( ASW2 ( ip ) + ASW2 ( 8 , 0 ) ) ;
pixR = AH2 ( pix0 . r , pix1 . r ) ; pixG = AH2 ( pix0 . g , pix1 . g ) ; pixB = AH2 ( pix0 . b , pix1 . b ) ; CasInputH ( pixR , pixG , pixB ) ; return ; }
# endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if ( noScaling ) {
ASW2 sp0 = ASW2 ( ip ) ;
2022-11-20 23:04:05 +00:00
AH3 a0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , - 1 ) ) ;
AH3 b0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 0 , - 1 ) ) ;
AH3 c0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , - 1 ) ) ;
AH3 d0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , 0 ) ) ;
AH3 e0 = CasLoadH ( TEXCALL sp0 ) ;
AH3 f0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , 0 ) ) ;
AH3 g0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , 1 ) ) ;
AH3 h0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 0 , 1 ) ) ;
AH3 i0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , 1 ) ) ;
2022-11-20 14:21:20 +00:00
ASW2 sp1 = sp0 + ASW2 ( 8 , 0 ) ;
2022-11-20 23:04:05 +00:00
AH3 a1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , - 1 ) ) ;
AH3 b1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 0 , - 1 ) ) ;
AH3 c1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , - 1 ) ) ;
AH3 d1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , 0 ) ) ;
AH3 e1 = CasLoadH ( TEXCALL sp1 ) ;
AH3 f1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , 0 ) ) ;
AH3 g1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , 1 ) ) ;
AH3 h1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 0 , 1 ) ) ;
AH3 i1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , 1 ) ) ;
2022-11-20 14:21:20 +00:00
// AOS to SOA conversion.
AH2 aR = AH2 ( a0 . r , a1 . r ) ;
AH2 aG = AH2 ( a0 . g , a1 . g ) ;
AH2 aB = AH2 ( a0 . b , a1 . b ) ;
AH2 bR = AH2 ( b0 . r , b1 . r ) ;
AH2 bG = AH2 ( b0 . g , b1 . g ) ;
AH2 bB = AH2 ( b0 . b , b1 . b ) ;
AH2 cR = AH2 ( c0 . r , c1 . r ) ;
AH2 cG = AH2 ( c0 . g , c1 . g ) ;
AH2 cB = AH2 ( c0 . b , c1 . b ) ;
AH2 dR = AH2 ( d0 . r , d1 . r ) ;
AH2 dG = AH2 ( d0 . g , d1 . g ) ;
AH2 dB = AH2 ( d0 . b , d1 . b ) ;
AH2 eR = AH2 ( e0 . r , e1 . r ) ;
AH2 eG = AH2 ( e0 . g , e1 . g ) ;
AH2 eB = AH2 ( e0 . b , e1 . b ) ;
AH2 fR = AH2 ( f0 . r , f1 . r ) ;
AH2 fG = AH2 ( f0 . g , f1 . g ) ;
AH2 fB = AH2 ( f0 . b , f1 . b ) ;
AH2 gR = AH2 ( g0 . r , g1 . r ) ;
AH2 gG = AH2 ( g0 . g , g1 . g ) ;
AH2 gB = AH2 ( g0 . b , g1 . b ) ;
AH2 hR = AH2 ( h0 . r , h1 . r ) ;
AH2 hG = AH2 ( h0 . g , h1 . g ) ;
AH2 hB = AH2 ( h0 . b , h1 . b ) ;
AH2 iR = AH2 ( i0 . r , i1 . r ) ;
AH2 iG = AH2 ( i0 . g , i1 . g ) ;
AH2 iB = AH2 ( i0 . b , i1 . b ) ;
// Run optional input transform.
CasInputH ( aR , aG , aB ) ;
CasInputH ( bR , bG , bB ) ;
CasInputH ( cR , cG , cB ) ;
CasInputH ( dR , dG , dB ) ;
CasInputH ( eR , eG , eB ) ;
CasInputH ( fR , fG , fB ) ;
CasInputH ( gR , gG , gB ) ;
CasInputH ( hR , hG , hB ) ;
CasInputH ( iR , iG , iB ) ;
// Soft min and max.
AH2 mnR = min ( min ( fR , hR ) , min ( min ( bR , dR ) , eR ) ) ;
AH2 mnG = min ( min ( fG , hG ) , min ( min ( bG , dG ) , eG ) ) ;
AH2 mnB = min ( min ( fB , hB ) , min ( min ( bB , dB ) , eB ) ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mnR2 = min ( min ( gR , iR ) , min ( min ( aR , cR ) , mnR ) ) ;
AH2 mnG2 = min ( min ( gG , iG ) , min ( min ( aG , cG ) , mnG ) ) ;
AH2 mnB2 = min ( min ( gB , iB ) , min ( min ( aB , cB ) , mnB ) ) ;
mnR = mnR + mnR2 ;
mnG = mnG + mnG2 ;
mnB = mnB + mnB2 ;
# endif
AH2 mxR = max ( max ( fR , hR ) , max ( max ( bR , dR ) , eR ) ) ;
AH2 mxG = max ( max ( fG , hG ) , max ( max ( bG , dG ) , eG ) ) ;
AH2 mxB = max ( max ( fB , hB ) , max ( max ( bB , dB ) , eB ) ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mxR2 = max ( max ( gR , iR ) , max ( max ( aR , cR ) , mxR ) ) ;
AH2 mxG2 = max ( max ( gG , iG ) , max ( max ( aG , cG ) , mxG ) ) ;
AH2 mxB2 = max ( max ( gB , iB ) , max ( max ( aB , cB ) , mxB ) ) ;
mxR = mxR + mxR2 ;
mxG = mxG + mxG2 ;
mxB = mxB + mxB2 ;
# endif
// Smooth minimum distance to signal limit divided by smooth max.
# ifdef CAS_GO_SLOWER
AH2 rcpMR = ARcpH2 ( mxR ) ;
AH2 rcpMG = ARcpH2 ( mxG ) ;
AH2 rcpMB = ARcpH2 ( mxB ) ;
# else
AH2 rcpMR = APrxLoRcpH2 ( mxR ) ;
AH2 rcpMG = APrxLoRcpH2 ( mxG ) ;
AH2 rcpMB = APrxLoRcpH2 ( mxB ) ;
# endif
# ifdef CAS_BETTER_DIAGONALS
AH2 ampR = ASatH2 ( min ( mnR , AH2_ ( 2.0 ) - mxR ) * rcpMR ) ;
AH2 ampG = ASatH2 ( min ( mnG , AH2_ ( 2.0 ) - mxG ) * rcpMG ) ;
AH2 ampB = ASatH2 ( min ( mnB , AH2_ ( 2.0 ) - mxB ) * rcpMB ) ;
# else
AH2 ampR = ASatH2 ( min ( mnR , AH2_ ( 1.0 ) - mxR ) * rcpMR ) ;
AH2 ampG = ASatH2 ( min ( mnG , AH2_ ( 1.0 ) - mxG ) * rcpMG ) ;
AH2 ampB = ASatH2 ( min ( mnB , AH2_ ( 1.0 ) - mxB ) * rcpMB ) ;
# endif
// Shaping amount of sharpening.
# ifdef CAS_GO_SLOWER
ampR = sqrt ( ampR ) ;
ampG = sqrt ( ampG ) ;
ampB = sqrt ( ampB ) ;
# else
ampR = APrxLoSqrtH2 ( ampR ) ;
ampG = APrxLoSqrtH2 ( ampG ) ;
ampB = APrxLoSqrtH2 ( ampB ) ;
# endif
// Filter shape.
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AH1 peak = AH2_AU1 ( const1 . y ) . x ;
A_MAYBE_UNUSED AH2 wR = ampR * AH2_ ( peak ) ;
A_MAYBE_UNUSED AH2 wG = ampG * AH2_ ( peak ) ;
A_MAYBE_UNUSED AH2 wB = ampB * AH2_ ( peak ) ;
2022-11-20 14:21:20 +00:00
// Filter.
# ifndef CAS_SLOW
# ifdef CAS_GO_SLOWER
AH2 rcpWeight = ARcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wG ) ;
# else
AH2 rcpWeight = APrxMedRcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wG ) ;
# endif
pixR = ASatH2 ( ( bR * wG + dR * wG + fR * wG + hR * wG + eR ) * rcpWeight ) ;
pixG = ASatH2 ( ( bG * wG + dG * wG + fG * wG + hG * wG + eG ) * rcpWeight ) ;
pixB = ASatH2 ( ( bB * wG + dB * wG + fB * wG + hB * wG + eB ) * rcpWeight ) ;
# else
# ifdef CAS_GO_SLOWER
AH2 rcpWeightR = ARcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wR ) ;
AH2 rcpWeightG = ARcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wG ) ;
AH2 rcpWeightB = ARcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wB ) ;
# else
AH2 rcpWeightR = APrxMedRcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wR ) ;
AH2 rcpWeightG = APrxMedRcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wG ) ;
AH2 rcpWeightB = APrxMedRcpH2 ( AH2_ ( 1.0 ) + AH2_ ( 4.0 ) * wB ) ;
# endif
pixR = ASatH2 ( ( bR * wR + dR * wR + fR * wR + hR * wR + eR ) * rcpWeightR ) ;
pixG = ASatH2 ( ( bG * wG + dG * wG + fG * wG + hG * wG + eG ) * rcpWeightG ) ;
pixB = ASatH2 ( ( bB * wB + dB * wB + fB * wB + hB * wB + eB ) * rcpWeightB ) ;
# endif
return ; }
//------------------------------------------------------------------------------------------------------------------------------
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
AF2 pp = AF2 ( ip ) * AF2_AU2 ( const0 . xy ) + AF2_AU2 ( const0 . zw ) ;
// Tile 0.
// Fractional position is needed in high precision here.
AF2 fp0 = floor ( pp ) ;
AH2 ppX ;
ppX . x = AH1 ( pp . x - fp0 . x ) ;
AH1 ppY = AH1 ( pp . y - fp0 . y ) ;
ASW2 sp0 = ASW2 ( fp0 ) ;
2022-11-20 23:04:05 +00:00
AH3 a0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , - 1 ) ) ;
AH3 b0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 0 , - 1 ) ) ;
AH3 e0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , 0 ) ) ;
AH3 f0 = CasLoadH ( TEXCALL sp0 ) ;
AH3 c0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , - 1 ) ) ;
AH3 d0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 2 , - 1 ) ) ;
AH3 g0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , 0 ) ) ;
AH3 h0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 2 , 0 ) ) ;
AH3 i0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , 1 ) ) ;
AH3 j0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 0 , 1 ) ) ;
AH3 m0 = CasLoadH ( TEXCALL sp0 + ASW2 ( - 1 , 2 ) ) ;
AH3 n0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 0 , 2 ) ) ;
AH3 k0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , 1 ) ) ;
AH3 l0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 2 , 1 ) ) ;
AH3 o0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 1 , 2 ) ) ;
AH3 p0 = CasLoadH ( TEXCALL sp0 + ASW2 ( 2 , 2 ) ) ;
2022-11-20 14:21:20 +00:00
// Tile 1 (offset only in x).
AF1 pp1 = pp . x + AF1_AU1 ( const1 . z ) ;
AF1 fp1 = floor ( pp1 ) ;
ppX . y = AH1 ( pp1 - fp1 ) ;
ASW2 sp1 = ASW2 ( fp1 , sp0 . y ) ;
2022-11-20 23:04:05 +00:00
AH3 a1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , - 1 ) ) ;
AH3 b1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 0 , - 1 ) ) ;
AH3 e1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , 0 ) ) ;
AH3 f1 = CasLoadH ( TEXCALL sp1 ) ;
AH3 c1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , - 1 ) ) ;
AH3 d1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 2 , - 1 ) ) ;
AH3 g1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , 0 ) ) ;
AH3 h1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 2 , 0 ) ) ;
AH3 i1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , 1 ) ) ;
AH3 j1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 0 , 1 ) ) ;
AH3 m1 = CasLoadH ( TEXCALL sp1 + ASW2 ( - 1 , 2 ) ) ;
AH3 n1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 0 , 2 ) ) ;
AH3 k1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , 1 ) ) ;
AH3 l1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 2 , 1 ) ) ;
AH3 o1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 1 , 2 ) ) ;
AH3 p1 = CasLoadH ( TEXCALL sp1 + ASW2 ( 2 , 2 ) ) ;
2022-11-20 14:21:20 +00:00
// AOS to SOA conversion.
AH2 aR = AH2 ( a0 . r , a1 . r ) ;
AH2 aG = AH2 ( a0 . g , a1 . g ) ;
AH2 aB = AH2 ( a0 . b , a1 . b ) ;
AH2 bR = AH2 ( b0 . r , b1 . r ) ;
AH2 bG = AH2 ( b0 . g , b1 . g ) ;
AH2 bB = AH2 ( b0 . b , b1 . b ) ;
AH2 cR = AH2 ( c0 . r , c1 . r ) ;
AH2 cG = AH2 ( c0 . g , c1 . g ) ;
AH2 cB = AH2 ( c0 . b , c1 . b ) ;
AH2 dR = AH2 ( d0 . r , d1 . r ) ;
AH2 dG = AH2 ( d0 . g , d1 . g ) ;
AH2 dB = AH2 ( d0 . b , d1 . b ) ;
AH2 eR = AH2 ( e0 . r , e1 . r ) ;
AH2 eG = AH2 ( e0 . g , e1 . g ) ;
AH2 eB = AH2 ( e0 . b , e1 . b ) ;
AH2 fR = AH2 ( f0 . r , f1 . r ) ;
AH2 fG = AH2 ( f0 . g , f1 . g ) ;
AH2 fB = AH2 ( f0 . b , f1 . b ) ;
AH2 gR = AH2 ( g0 . r , g1 . r ) ;
AH2 gG = AH2 ( g0 . g , g1 . g ) ;
AH2 gB = AH2 ( g0 . b , g1 . b ) ;
AH2 hR = AH2 ( h0 . r , h1 . r ) ;
AH2 hG = AH2 ( h0 . g , h1 . g ) ;
AH2 hB = AH2 ( h0 . b , h1 . b ) ;
AH2 iR = AH2 ( i0 . r , i1 . r ) ;
AH2 iG = AH2 ( i0 . g , i1 . g ) ;
AH2 iB = AH2 ( i0 . b , i1 . b ) ;
AH2 jR = AH2 ( j0 . r , j1 . r ) ;
AH2 jG = AH2 ( j0 . g , j1 . g ) ;
AH2 jB = AH2 ( j0 . b , j1 . b ) ;
AH2 kR = AH2 ( k0 . r , k1 . r ) ;
AH2 kG = AH2 ( k0 . g , k1 . g ) ;
AH2 kB = AH2 ( k0 . b , k1 . b ) ;
AH2 lR = AH2 ( l0 . r , l1 . r ) ;
AH2 lG = AH2 ( l0 . g , l1 . g ) ;
AH2 lB = AH2 ( l0 . b , l1 . b ) ;
AH2 mR = AH2 ( m0 . r , m1 . r ) ;
AH2 mG = AH2 ( m0 . g , m1 . g ) ;
AH2 mB = AH2 ( m0 . b , m1 . b ) ;
AH2 nR = AH2 ( n0 . r , n1 . r ) ;
AH2 nG = AH2 ( n0 . g , n1 . g ) ;
AH2 nB = AH2 ( n0 . b , n1 . b ) ;
AH2 oR = AH2 ( o0 . r , o1 . r ) ;
AH2 oG = AH2 ( o0 . g , o1 . g ) ;
AH2 oB = AH2 ( o0 . b , o1 . b ) ;
AH2 pR = AH2 ( p0 . r , p1 . r ) ;
AH2 pG = AH2 ( p0 . g , p1 . g ) ;
AH2 pB = AH2 ( p0 . b , p1 . b ) ;
// Run optional input transform.
CasInputH ( aR , aG , aB ) ;
CasInputH ( bR , bG , bB ) ;
CasInputH ( cR , cG , cB ) ;
CasInputH ( dR , dG , dB ) ;
CasInputH ( eR , eG , eB ) ;
CasInputH ( fR , fG , fB ) ;
CasInputH ( gR , gG , gB ) ;
CasInputH ( hR , hG , hB ) ;
CasInputH ( iR , iG , iB ) ;
CasInputH ( jR , jG , jB ) ;
CasInputH ( kR , kG , kB ) ;
CasInputH ( lR , lG , lB ) ;
CasInputH ( mR , mG , mB ) ;
CasInputH ( nR , nG , nB ) ;
CasInputH ( oR , oG , oB ) ;
CasInputH ( pR , pG , pB ) ;
// Soft min and max.
// These are 2.0x bigger (factored out the extra multiply).
// a b c b
// e f g * 0.5 + e f g * 0.5 [F]
// i j k j
AH2 mnfR = AMin3H2 ( AMin3H2 ( bR , eR , fR ) , gR , jR ) ;
AH2 mnfG = AMin3H2 ( AMin3H2 ( bG , eG , fG ) , gG , jG ) ;
AH2 mnfB = AMin3H2 ( AMin3H2 ( bB , eB , fB ) , gB , jB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mnfR2 = AMin3H2 ( AMin3H2 ( mnfR , aR , cR ) , iR , kR ) ;
AH2 mnfG2 = AMin3H2 ( AMin3H2 ( mnfG , aG , cG ) , iG , kG ) ;
AH2 mnfB2 = AMin3H2 ( AMin3H2 ( mnfB , aB , cB ) , iB , kB ) ;
mnfR = mnfR + mnfR2 ;
mnfG = mnfG + mnfG2 ;
mnfB = mnfB + mnfB2 ;
# endif
AH2 mxfR = AMax3H2 ( AMax3H2 ( bR , eR , fR ) , gR , jR ) ;
AH2 mxfG = AMax3H2 ( AMax3H2 ( bG , eG , fG ) , gG , jG ) ;
AH2 mxfB = AMax3H2 ( AMax3H2 ( bB , eB , fB ) , gB , jB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mxfR2 = AMax3H2 ( AMax3H2 ( mxfR , aR , cR ) , iR , kR ) ;
AH2 mxfG2 = AMax3H2 ( AMax3H2 ( mxfG , aG , cG ) , iG , kG ) ;
AH2 mxfB2 = AMax3H2 ( AMax3H2 ( mxfB , aB , cB ) , iB , kB ) ;
mxfR = mxfR + mxfR2 ;
mxfG = mxfG + mxfG2 ;
mxfB = mxfB + mxfB2 ;
# endif
// b c d c
// f g h * 0.5 + f g h * 0.5 [G]
// j k l k
AH2 mngR = AMin3H2 ( AMin3H2 ( cR , fR , gR ) , hR , kR ) ;
AH2 mngG = AMin3H2 ( AMin3H2 ( cG , fG , gG ) , hG , kG ) ;
AH2 mngB = AMin3H2 ( AMin3H2 ( cB , fB , gB ) , hB , kB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mngR2 = AMin3H2 ( AMin3H2 ( mngR , bR , dR ) , jR , lR ) ;
AH2 mngG2 = AMin3H2 ( AMin3H2 ( mngG , bG , dG ) , jG , lG ) ;
AH2 mngB2 = AMin3H2 ( AMin3H2 ( mngB , bB , dB ) , jB , lB ) ;
mngR = mngR + mngR2 ;
mngG = mngG + mngG2 ;
mngB = mngB + mngB2 ;
# endif
AH2 mxgR = AMax3H2 ( AMax3H2 ( cR , fR , gR ) , hR , kR ) ;
AH2 mxgG = AMax3H2 ( AMax3H2 ( cG , fG , gG ) , hG , kG ) ;
AH2 mxgB = AMax3H2 ( AMax3H2 ( cB , fB , gB ) , hB , kB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mxgR2 = AMax3H2 ( AMax3H2 ( mxgR , bR , dR ) , jR , lR ) ;
AH2 mxgG2 = AMax3H2 ( AMax3H2 ( mxgG , bG , dG ) , jG , lG ) ;
AH2 mxgB2 = AMax3H2 ( AMax3H2 ( mxgB , bB , dB ) , jB , lB ) ;
mxgR = mxgR + mxgR2 ;
mxgG = mxgG + mxgG2 ;
mxgB = mxgB + mxgB2 ;
# endif
// e f g f
// i j k * 0.5 + i j k * 0.5 [J]
// m n o n
AH2 mnjR = AMin3H2 ( AMin3H2 ( fR , iR , jR ) , kR , nR ) ;
AH2 mnjG = AMin3H2 ( AMin3H2 ( fG , iG , jG ) , kG , nG ) ;
AH2 mnjB = AMin3H2 ( AMin3H2 ( fB , iB , jB ) , kB , nB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mnjR2 = AMin3H2 ( AMin3H2 ( mnjR , eR , gR ) , mR , oR ) ;
AH2 mnjG2 = AMin3H2 ( AMin3H2 ( mnjG , eG , gG ) , mG , oG ) ;
AH2 mnjB2 = AMin3H2 ( AMin3H2 ( mnjB , eB , gB ) , mB , oB ) ;
mnjR = mnjR + mnjR2 ;
mnjG = mnjG + mnjG2 ;
mnjB = mnjB + mnjB2 ;
# endif
AH2 mxjR = AMax3H2 ( AMax3H2 ( fR , iR , jR ) , kR , nR ) ;
AH2 mxjG = AMax3H2 ( AMax3H2 ( fG , iG , jG ) , kG , nG ) ;
AH2 mxjB = AMax3H2 ( AMax3H2 ( fB , iB , jB ) , kB , nB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mxjR2 = AMax3H2 ( AMax3H2 ( mxjR , eR , gR ) , mR , oR ) ;
AH2 mxjG2 = AMax3H2 ( AMax3H2 ( mxjG , eG , gG ) , mG , oG ) ;
AH2 mxjB2 = AMax3H2 ( AMax3H2 ( mxjB , eB , gB ) , mB , oB ) ;
mxjR = mxjR + mxjR2 ;
mxjG = mxjG + mxjG2 ;
mxjB = mxjB + mxjB2 ;
# endif
// f g h g
// j k l * 0.5 + j k l * 0.5 [K]
// n o p o
AH2 mnkR = AMin3H2 ( AMin3H2 ( gR , jR , kR ) , lR , oR ) ;
AH2 mnkG = AMin3H2 ( AMin3H2 ( gG , jG , kG ) , lG , oG ) ;
AH2 mnkB = AMin3H2 ( AMin3H2 ( gB , jB , kB ) , lB , oB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mnkR2 = AMin3H2 ( AMin3H2 ( mnkR , fR , hR ) , nR , pR ) ;
AH2 mnkG2 = AMin3H2 ( AMin3H2 ( mnkG , fG , hG ) , nG , pG ) ;
AH2 mnkB2 = AMin3H2 ( AMin3H2 ( mnkB , fB , hB ) , nB , pB ) ;
mnkR = mnkR + mnkR2 ;
mnkG = mnkG + mnkG2 ;
mnkB = mnkB + mnkB2 ;
# endif
AH2 mxkR = AMax3H2 ( AMax3H2 ( gR , jR , kR ) , lR , oR ) ;
AH2 mxkG = AMax3H2 ( AMax3H2 ( gG , jG , kG ) , lG , oG ) ;
AH2 mxkB = AMax3H2 ( AMax3H2 ( gB , jB , kB ) , lB , oB ) ;
# ifdef CAS_BETTER_DIAGONALS
AH2 mxkR2 = AMax3H2 ( AMax3H2 ( mxkR , fR , hR ) , nR , pR ) ;
AH2 mxkG2 = AMax3H2 ( AMax3H2 ( mxkG , fG , hG ) , nG , pG ) ;
AH2 mxkB2 = AMax3H2 ( AMax3H2 ( mxkB , fB , hB ) , nB , pB ) ;
mxkR = mxkR + mxkR2 ;
mxkG = mxkG + mxkG2 ;
mxkB = mxkB + mxkB2 ;
# endif
// Smooth minimum distance to signal limit divided by smooth max.
# ifdef CAS_GO_SLOWER
AH2 rcpMfR = ARcpH2 ( mxfR ) ;
AH2 rcpMfG = ARcpH2 ( mxfG ) ;
AH2 rcpMfB = ARcpH2 ( mxfB ) ;
AH2 rcpMgR = ARcpH2 ( mxgR ) ;
AH2 rcpMgG = ARcpH2 ( mxgG ) ;
AH2 rcpMgB = ARcpH2 ( mxgB ) ;
AH2 rcpMjR = ARcpH2 ( mxjR ) ;
AH2 rcpMjG = ARcpH2 ( mxjG ) ;
AH2 rcpMjB = ARcpH2 ( mxjB ) ;
AH2 rcpMkR = ARcpH2 ( mxkR ) ;
AH2 rcpMkG = ARcpH2 ( mxkG ) ;
AH2 rcpMkB = ARcpH2 ( mxkB ) ;
# else
AH2 rcpMfR = APrxLoRcpH2 ( mxfR ) ;
AH2 rcpMfG = APrxLoRcpH2 ( mxfG ) ;
AH2 rcpMfB = APrxLoRcpH2 ( mxfB ) ;
AH2 rcpMgR = APrxLoRcpH2 ( mxgR ) ;
AH2 rcpMgG = APrxLoRcpH2 ( mxgG ) ;
AH2 rcpMgB = APrxLoRcpH2 ( mxgB ) ;
AH2 rcpMjR = APrxLoRcpH2 ( mxjR ) ;
AH2 rcpMjG = APrxLoRcpH2 ( mxjG ) ;
AH2 rcpMjB = APrxLoRcpH2 ( mxjB ) ;
AH2 rcpMkR = APrxLoRcpH2 ( mxkR ) ;
AH2 rcpMkG = APrxLoRcpH2 ( mxkG ) ;
AH2 rcpMkB = APrxLoRcpH2 ( mxkB ) ;
# endif
# ifdef CAS_BETTER_DIAGONALS
AH2 ampfR = ASatH2 ( min ( mnfR , AH2_ ( 2.0 ) - mxfR ) * rcpMfR ) ;
AH2 ampfG = ASatH2 ( min ( mnfG , AH2_ ( 2.0 ) - mxfG ) * rcpMfG ) ;
AH2 ampfB = ASatH2 ( min ( mnfB , AH2_ ( 2.0 ) - mxfB ) * rcpMfB ) ;
AH2 ampgR = ASatH2 ( min ( mngR , AH2_ ( 2.0 ) - mxgR ) * rcpMgR ) ;
AH2 ampgG = ASatH2 ( min ( mngG , AH2_ ( 2.0 ) - mxgG ) * rcpMgG ) ;
AH2 ampgB = ASatH2 ( min ( mngB , AH2_ ( 2.0 ) - mxgB ) * rcpMgB ) ;
AH2 ampjR = ASatH2 ( min ( mnjR , AH2_ ( 2.0 ) - mxjR ) * rcpMjR ) ;
AH2 ampjG = ASatH2 ( min ( mnjG , AH2_ ( 2.0 ) - mxjG ) * rcpMjG ) ;
AH2 ampjB = ASatH2 ( min ( mnjB , AH2_ ( 2.0 ) - mxjB ) * rcpMjB ) ;
AH2 ampkR = ASatH2 ( min ( mnkR , AH2_ ( 2.0 ) - mxkR ) * rcpMkR ) ;
AH2 ampkG = ASatH2 ( min ( mnkG , AH2_ ( 2.0 ) - mxkG ) * rcpMkG ) ;
AH2 ampkB = ASatH2 ( min ( mnkB , AH2_ ( 2.0 ) - mxkB ) * rcpMkB ) ;
# else
AH2 ampfR = ASatH2 ( min ( mnfR , AH2_ ( 1.0 ) - mxfR ) * rcpMfR ) ;
AH2 ampfG = ASatH2 ( min ( mnfG , AH2_ ( 1.0 ) - mxfG ) * rcpMfG ) ;
AH2 ampfB = ASatH2 ( min ( mnfB , AH2_ ( 1.0 ) - mxfB ) * rcpMfB ) ;
AH2 ampgR = ASatH2 ( min ( mngR , AH2_ ( 1.0 ) - mxgR ) * rcpMgR ) ;
AH2 ampgG = ASatH2 ( min ( mngG , AH2_ ( 1.0 ) - mxgG ) * rcpMgG ) ;
AH2 ampgB = ASatH2 ( min ( mngB , AH2_ ( 1.0 ) - mxgB ) * rcpMgB ) ;
AH2 ampjR = ASatH2 ( min ( mnjR , AH2_ ( 1.0 ) - mxjR ) * rcpMjR ) ;
AH2 ampjG = ASatH2 ( min ( mnjG , AH2_ ( 1.0 ) - mxjG ) * rcpMjG ) ;
AH2 ampjB = ASatH2 ( min ( mnjB , AH2_ ( 1.0 ) - mxjB ) * rcpMjB ) ;
AH2 ampkR = ASatH2 ( min ( mnkR , AH2_ ( 1.0 ) - mxkR ) * rcpMkR ) ;
AH2 ampkG = ASatH2 ( min ( mnkG , AH2_ ( 1.0 ) - mxkG ) * rcpMkG ) ;
AH2 ampkB = ASatH2 ( min ( mnkB , AH2_ ( 1.0 ) - mxkB ) * rcpMkB ) ;
# endif
// Shaping amount of sharpening.
# ifdef CAS_GO_SLOWER
ampfR = sqrt ( ampfR ) ;
ampfG = sqrt ( ampfG ) ;
ampfB = sqrt ( ampfB ) ;
ampgR = sqrt ( ampgR ) ;
ampgG = sqrt ( ampgG ) ;
ampgB = sqrt ( ampgB ) ;
ampjR = sqrt ( ampjR ) ;
ampjG = sqrt ( ampjG ) ;
ampjB = sqrt ( ampjB ) ;
ampkR = sqrt ( ampkR ) ;
ampkG = sqrt ( ampkG ) ;
ampkB = sqrt ( ampkB ) ;
# else
ampfR = APrxLoSqrtH2 ( ampfR ) ;
ampfG = APrxLoSqrtH2 ( ampfG ) ;
ampfB = APrxLoSqrtH2 ( ampfB ) ;
ampgR = APrxLoSqrtH2 ( ampgR ) ;
ampgG = APrxLoSqrtH2 ( ampgG ) ;
ampgB = APrxLoSqrtH2 ( ampgB ) ;
ampjR = APrxLoSqrtH2 ( ampjR ) ;
ampjG = APrxLoSqrtH2 ( ampjG ) ;
ampjB = APrxLoSqrtH2 ( ampjB ) ;
ampkR = APrxLoSqrtH2 ( ampkR ) ;
ampkG = APrxLoSqrtH2 ( ampkG ) ;
ampkB = APrxLoSqrtH2 ( ampkB ) ;
# endif
// Filter shape.
AH1 peak = AH2_AU1 ( const1 . y ) . x ;
AH2 wfR = ampfR * AH2_ ( peak ) ;
AH2 wfG = ampfG * AH2_ ( peak ) ;
AH2 wfB = ampfB * AH2_ ( peak ) ;
AH2 wgR = ampgR * AH2_ ( peak ) ;
AH2 wgG = ampgG * AH2_ ( peak ) ;
AH2 wgB = ampgB * AH2_ ( peak ) ;
AH2 wjR = ampjR * AH2_ ( peak ) ;
AH2 wjG = ampjG * AH2_ ( peak ) ;
AH2 wjB = ampjB * AH2_ ( peak ) ;
AH2 wkR = ampkR * AH2_ ( peak ) ;
AH2 wkG = ampkG * AH2_ ( peak ) ;
AH2 wkB = ampkB * AH2_ ( peak ) ;
// Blend between 4 results.
AH2 s = ( AH2_ ( 1.0 ) - ppX ) * ( AH2_ ( 1.0 ) - AH2_ ( ppY ) ) ;
AH2 t = ppX * ( AH2_ ( 1.0 ) - AH2_ ( ppY ) ) ;
AH2 u = ( AH2_ ( 1.0 ) - ppX ) * AH2_ ( ppY ) ;
AH2 v = ppX * AH2_ ( ppY ) ;
// Thin edges to hide bilinear interpolation (helps diagonals).
AH2 thinB = AH2_ ( 1.0 / 32.0 ) ;
# ifdef CAS_GO_SLOWER
s * = ARcpH2 ( thinB + ( mxfG - mnfG ) ) ;
t * = ARcpH2 ( thinB + ( mxgG - mngG ) ) ;
u * = ARcpH2 ( thinB + ( mxjG - mnjG ) ) ;
v * = ARcpH2 ( thinB + ( mxkG - mnkG ) ) ;
# else
s * = APrxLoRcpH2 ( thinB + ( mxfG - mnfG ) ) ;
t * = APrxLoRcpH2 ( thinB + ( mxgG - mngG ) ) ;
u * = APrxLoRcpH2 ( thinB + ( mxjG - mnjG ) ) ;
v * = APrxLoRcpH2 ( thinB + ( mxkG - mnkG ) ) ;
# endif
// Final weighting.
2022-11-20 23:04:05 +00:00
A_MAYBE_UNUSED AH2 qbeR = wfR * s ;
A_MAYBE_UNUSED AH2 qbeG = wfG * s ;
A_MAYBE_UNUSED AH2 qbeB = wfB * s ;
A_MAYBE_UNUSED AH2 qchR = wgR * t ;
A_MAYBE_UNUSED AH2 qchG = wgG * t ;
A_MAYBE_UNUSED AH2 qchB = wgB * t ;
A_MAYBE_UNUSED AH2 qfR = wgR * t + wjR * u + s ;
A_MAYBE_UNUSED AH2 qfG = wgG * t + wjG * u + s ;
A_MAYBE_UNUSED AH2 qfB = wgB * t + wjB * u + s ;
A_MAYBE_UNUSED AH2 qgR = wfR * s + wkR * v + t ;
A_MAYBE_UNUSED AH2 qgG = wfG * s + wkG * v + t ;
A_MAYBE_UNUSED AH2 qgB = wfB * s + wkB * v + t ;
A_MAYBE_UNUSED AH2 qjR = wfR * s + wkR * v + u ;
A_MAYBE_UNUSED AH2 qjG = wfG * s + wkG * v + u ;
A_MAYBE_UNUSED AH2 qjB = wfB * s + wkB * v + u ;
A_MAYBE_UNUSED AH2 qkR = wgR * t + wjR * u + v ;
A_MAYBE_UNUSED AH2 qkG = wgG * t + wjG * u + v ;
A_MAYBE_UNUSED AH2 qkB = wgB * t + wjB * u + v ;
A_MAYBE_UNUSED AH2 qinR = wjR * u ;
A_MAYBE_UNUSED AH2 qinG = wjG * u ;
A_MAYBE_UNUSED AH2 qinB = wjB * u ;
A_MAYBE_UNUSED AH2 qloR = wkR * v ;
A_MAYBE_UNUSED AH2 qloG = wkG * v ;
A_MAYBE_UNUSED AH2 qloB = wkB * v ;
2022-11-20 14:21:20 +00:00
// Filter.
# ifndef CAS_SLOW
# ifdef CAS_GO_SLOWER
AH2 rcpWG = ARcpH2 ( AH2_ ( 2.0 ) * qbeG + AH2_ ( 2.0 ) * qchG + AH2_ ( 2.0 ) * qinG + AH2_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
# else
AH2 rcpWG = APrxMedRcpH2 ( AH2_ ( 2.0 ) * qbeG + AH2_ ( 2.0 ) * qchG + AH2_ ( 2.0 ) * qinG + AH2_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
# endif
pixR = ASatH2 ( ( bR * qbeG + eR * qbeG + cR * qchG + hR * qchG + iR * qinG + nR * qinG + lR * qloG + oR * qloG + fR * qfG + gR * qgG + jR * qjG + kR * qkG ) * rcpWG ) ;
pixG = ASatH2 ( ( bG * qbeG + eG * qbeG + cG * qchG + hG * qchG + iG * qinG + nG * qinG + lG * qloG + oG * qloG + fG * qfG + gG * qgG + jG * qjG + kG * qkG ) * rcpWG ) ;
pixB = ASatH2 ( ( bB * qbeG + eB * qbeG + cB * qchG + hB * qchG + iB * qinG + nB * qinG + lB * qloG + oB * qloG + fB * qfG + gB * qgG + jB * qjG + kB * qkG ) * rcpWG ) ;
# else
# ifdef CAS_GO_SLOWER
AH2 rcpWR = ARcpH2 ( AH2_ ( 2.0 ) * qbeR + AH2_ ( 2.0 ) * qchR + AH2_ ( 2.0 ) * qinR + AH2_ ( 2.0 ) * qloR + qfR + qgR + qjR + qkR ) ;
AH2 rcpWG = ARcpH2 ( AH2_ ( 2.0 ) * qbeG + AH2_ ( 2.0 ) * qchG + AH2_ ( 2.0 ) * qinG + AH2_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
AH2 rcpWB = ARcpH2 ( AH2_ ( 2.0 ) * qbeB + AH2_ ( 2.0 ) * qchB + AH2_ ( 2.0 ) * qinB + AH2_ ( 2.0 ) * qloB + qfB + qgB + qjB + qkB ) ;
# else
AH2 rcpWR = APrxMedRcpH2 ( AH2_ ( 2.0 ) * qbeR + AH2_ ( 2.0 ) * qchR + AH2_ ( 2.0 ) * qinR + AH2_ ( 2.0 ) * qloR + qfR + qgR + qjR + qkR ) ;
AH2 rcpWG = APrxMedRcpH2 ( AH2_ ( 2.0 ) * qbeG + AH2_ ( 2.0 ) * qchG + AH2_ ( 2.0 ) * qinG + AH2_ ( 2.0 ) * qloG + qfG + qgG + qjG + qkG ) ;
AH2 rcpWB = APrxMedRcpH2 ( AH2_ ( 2.0 ) * qbeB + AH2_ ( 2.0 ) * qchB + AH2_ ( 2.0 ) * qinB + AH2_ ( 2.0 ) * qloB + qfB + qgB + qjB + qkB ) ;
# endif
pixR = ASatH2 ( ( bR * qbeR + eR * qbeR + cR * qchR + hR * qchR + iR * qinR + nR * qinR + lR * qloR + oR * qloR + fR * qfR + gR * qgR + jR * qjR + kR * qkR ) * rcpWR ) ;
pixG = ASatH2 ( ( bG * qbeG + eG * qbeG + cG * qchG + hG * qchG + iG * qinG + nG * qinG + lG * qloG + oG * qloG + fG * qfG + gG * qgG + jG * qjG + kG * qkG ) * rcpWG ) ;
pixB = ASatH2 ( ( bB * qbeB + eB * qbeB + cB * qchB + hB * qchB + iB * qinB + nB * qinB + lB * qloB + oB * qloB + fB * qfB + gB * qgB + jB * qjB + kB * qkB ) * rcpWB ) ;
# endif
}
2022-11-20 23:04:05 +00:00
// Undefine the TEXINPUT and TEXCALL macros so that neither name stays defined for code after this point.
# undef TEXINPUT
# undef TEXCALL
2022-11-20 14:21:20 +00:00
# endif