//_____________________________________________________________/\_______________________________________________________________ //============================================================================================================================== // // [CAS] FIDELITY FX - CONTRAST ADAPTIVE SHARPENING 1.20190610 // //============================================================================================================================== // LICENSE // ======= // Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. // ------- // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // ------- // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the // Software. // ------- // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. //------------------------------------------------------------------------------------------------------------------------------ // ABOUT // ===== // CAS is a spatial only filter. // CAS takes RGB color input. // CAS enhances sharpness and local high-frequency contrast, with or without added upsampling. // CAS outputs RGB color.
//------------------------------------------------------------------------------------------------------------------------------ // SUGGESTIONS FOR INTEGRATION // =========================== // Best for performance, run CAS in sharpen-only mode, choose a video mode to have scan-out or the display scale. // - Sharpen-only mode is faster, and provides a better quality sharpening. // The scaling support in CAS was designed for when the application wants to do Dynamic Resolution Scaling (DRS). // - With DRS, the render resolution can change per frame. // - Use CAS to sharpen and upsample to the fixed output resolution, then composite the full resolution UI over CAS output. // - This can all happen in one compute dispatch. // It is likely better to reduce the amount of film grain which happens before CAS (as CAS will amplify grain). // - An alternative would be to add grain after CAS. // It is best to run CAS after tonemapping. // - CAS needs to have input value 1.0 at the peak of the display output. // It is ok to run CAS after compositing UI (it won't harm the UI). //------------------------------------------------------------------------------------------------------------------------------ // EXECUTION // ========= // CAS runs as a compute shader. // CAS is designed to be run either in a 32-bit, CasFilter(), or packed 16-bit, CasFilterH(), form. // The 32-bit form works on 8x8 tiles via one {64,1,1} workgroup. // The 16-bit form works on a pair of 8x8 tiles in a 16x8 configuration via one {64,1,1} workgroup. // CAS is designed to work best in semi-persistent form if running not async with graphics. // For 32-bit this means looping across a collection of 4 8x8 tiles in a 2x2 tile foot-print. // For 16-bit this means looping 2 times, once for the top 16x8 region and once for the bottom 16x8 region. 
//------------------------------------------------------------------------------------------------------------------------------ // INTEGRATION SUMMARY FOR CPU // =========================== // // Make sure <stdint.h> has already been included. // // Setup pre-portability-header defines. // #define A_CPU 1 // // Include the portability header (requires version 1.20190530 or later which is backwards compatible). // #include "ffx_a.h" // // Include the CAS header. // #include "ffx_cas.h" // ... // // Call the setup function to build out the constants for the shader, pass these to the shader. // // The 'varAU4(const0);' expands into 'uint32_t const0[4];' on the CPU. // varAU4(const0); // varAU4(const1); // CasSetup(const0,const1, // 0.0f, // Sharpness tuning knob (0.0 to 1.0). // 1920.0f,1080.0f, // Example input size. // 2560.0f,1440.0f); // Example output size. // ... // // Later dispatch the shader based on the amount of semi-persistent loop unrolling. // // Here is an example for running with the 16x16 (4-way unroll for 32-bit or 2-way unroll for 16-bit) // vkCmdDispatch(cmdBuf,(widthInPixels+15)>>4,(heightInPixels+15)>>4,1); //------------------------------------------------------------------------------------------------------------------------------ // INTEGRATION SUMMARY FOR GPU // =========================== // // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT. // layout(set=0,binding=0,rgba16f)uniform image2D imgSrc; // layout(set=0,binding=1,rgba16f)uniform image2D imgDst; // ... // // Setup pre-portability-header defines (sets up GLSL/HLSL path, packed math support, etc) // #define A_GPU 1 // #define A_GLSL 1 // #define A_HALF 1 // ... // // Include the portability header (or copy it in without an include). // #include "ffx_a.h" // ... // // Define the fetch function(s). // // CasLoad() takes a 32-bit unsigned integer 2D coordinate and loads color.
// AF3 CasLoad(ASU2 p){return imageLoad(imgSrc,p).rgb;} // // CasLoadH() is the 16-bit version taking 16-bit unsigned integer 2D coordinate and loading 16-bit float color. // // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead. // // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead. // AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);} // // If you define CAS_TEXTURE and/or CAS_TEXTUREH to a type, a value of that type will be added as the first input to CasFilter and forwarded to CasLoad // // This is useful for forwarding extra data to the load functions, and is required by MSL, which doesn't use global textures // ... // // Define the input modifiers as nop's initially. // // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions. // void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){} // void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){} // ... // // Include this CAS header file (or copy it in without an include). // #include "ffx_cas.h" // ... // // Example in shader integration for loop-unrolled 16x16 case for 32-bit. // layout(local_size_x=64)in; // void main(){ // // Fetch constants from CasSetup(). // AU4 const0=cb.const0; // AU4 const1=cb.const1; // // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. // AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u); // // Filter. // AF4 c; // CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c); // gxy.x+=8u; // CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c); // gxy.y+=8u; // CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c); // gxy.x-=8u; // CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);} // ... // // Example for semi-persistent 16x16 but this time for packed math. 
// // Use this before including 'cas.h' if not using the non-packed filter function. // #define CAS_PACKED_ONLY 1 // ... // layout(local_size_x=64)in; // void main(){ // // Fetch constants from CasSetup(). // AU4 const0=cb.const0; // AU4 const1=cb.const1; // // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. // AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u); // // Filter. // AH4 c0,c1;AH2 cR,cG,cB; // CasFilterH(cR,cG,cB,gxy,const0,const1,false); // // Extra work integrated after CAS would go here. // ... // // Suggest only running CasDepack() right before stores, to maintain packed math for any work after CasFilterH(). // CasDepack(c0,c1,cR,cG,cB); // imageStore(imgDst,ASU2(gxy),AF4(c0)); // imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1)); // gxy.y+=8u; // CasFilterH(cR,cG,cB,gxy,const0,const1,false); // ... // CasDepack(c0,c1,cR,cG,cB); // imageStore(imgDst,ASU2(gxy),AF4(c0)); // imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));} //------------------------------------------------------------------------------------------------------------------------------ // CAS FILTERING LOGIC // =================== // CAS uses the minimal nearest 3x3 source texel window for filtering. // The filter coefficients are radially symmetric (phase adaptive, computed per pixel based on output pixel center). // The filter kernel adapts to local contrast (adjusting the negative lobe strength of the filter kernel). //------------------------------------------------------------------------------------------------------------------------------ // CAS INPUT REQUIREMENTS // ====================== // This is designed to be a linear filter. // Running CAS on perceptual inputs will yield over-sharpening. // Input must range between {0 to 1} for each color channel. // CAS output will be {0 to 1} ranged as well. // CAS does 5 loads, so any conversion applied during CasLoad() or CasInput() has a 5 load * 3 channel = 15x cost amplifier. 
// - So input conversions need to be factored into the prior pass's output. // - But if necessary use CasInput() instead of CasLoad(), as CasInput() works with packed color. // - For CAS with scaling the amplifier is 12 load * 3 channel = 36x cost amplifier. // Any conversion applied to output has a 3x cost amplifier (3 color channels). // - Output conversions are substantially less expensive. // Added VALU ops due to conversions will have visible cost as this shader is already quite VALU heavy. // This filter does not function well on sRGB or gamma 2.2 non-linear data. // This filter does not function on PQ non-linear data. // - Due to the shape of PQ, the positive side of the ring created by the negative lobe tends to become over-bright. //------------------------------------------------------------------------------------------------------------------------------ // INPUT FORMAT SPECIFIC CASES // =========================== // - FP16 with all non-negative values ranging {0 to 1}. // - Use as is, filter is designed for linear input and output ranging {0 to 1}. // --------------------------- // - UNORM with linear conversion approximation. // - This could be used for both sRGB or FreeSync2 native (gamma 2.2) cases. // - Load/store with either 10:10:10:2 UNORM or 8:8:8:8 UNORM (aka VK_FORMAT_R8G8B8A8_UNORM). // - Use gamma 2.0 conversion in CasInput(), as an approximation. // - Modifications: // // Change the CasInput*() function to square the inputs. // void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){r*=r;g*=g;b*=b;} // void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){r*=r;g*=g;b*=b;} // ... // // Do linear to gamma 2.0 before store. // // Since it will be common to do processing after CAS, the filter function returns linear. // c.r=sqrt(c.r);c.g=sqrt(c.g);c.b=sqrt(c.b); // imageStore(imgDst,ASU2(gxy),c); // ... // // And for packed. 
// CasFilterH(cR,cG,cB,gxy,const0,const1,true); // cR=sqrt(cR);cG=sqrt(cG);cB=sqrt(cB); // CasDepack(c0,c1,cR,cG,cB); // imageStore(img[0],ASU2(gxy),AF4(c0)); // imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1)); // --------------------------- // - sRGB with slightly better quality and higher cost. // - Use texelFetch() with sRGB format (VK_FORMAT_R8G8B8A8_SRGB) for loads (gets linear into shader). // - Store to destination using UNORM (not sRGB) stores and do the linear to sRGB conversion in the shader. // - Modifications: // // Use texel fetch instead of image load (on GCN this will translate into an image load in the driver). // // Hardware has sRGB to linear on loads (but in API only for read-only, aka texture instead of UAV/image). // AF3 CasLoad(ASU2 p){return texelFetch(texSrc,p,0).rgb;} // ... // // Do linear to sRGB before store (GPU lacking hardware conversion support for linear to sRGB on store). // c.r=AToSrgbF1(c.r);c.g=AToSrgbF1(c.g);c.b=AToSrgbF1(c.b); // imageStore(imgDst,ASU2(gxy),c); // ... // // And for packed. // CasFilterH(cR,cG,cB,gxy,const0,const1,true); // cR=AToSrgbH2(cR);cG=AToSrgbH2(cG);cB=AToSrgbH2(cB); // CasDepack(c0,c1,cR,cG,cB); // imageStore(img[0],ASU2(gxy),AF4(c0)); // imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1)); // --------------------------- // - HDR10 output via scRGB. // - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values). // - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits. // - Where 'maxNits' is typically not 10000. // - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white. // - This can be even as low as 1000 nits on some HDR TVs. // - After CAS do matrix multiply to take Rec.2020 back to sRGB and multiply by 'maxNits/80.0'. // - Showing GPU code below to generate constants, likely most need to use CPU code instead. // - Keeping the GPU code here because it is easier to read in these docs. 
// - Can use 'lpm.h' source to generate the conversion matrix for Rec.2020 to sRGB: // // Output conversion matrix from Rec.2020 to sRGB. // AF3 conR,conG,conB; // // Working space temporaries (Rec.2020). // AF3 rgbToXyzXW;AF3 rgbToXyzYW;AF3 rgbToXyzZW; // LpmColRgbToXyz(rgbToXyzXW,rgbToXyzYW,rgbToXyzZW,lpmCol2020R,lpmCol2020G,lpmCol2020B,lpmColD65); // // Output space temporaries (Rec.709, same as sRGB primaries). // AF3 rgbToXyzXO;AF3 rgbToXyzYO;AF3 rgbToXyzZO; // LpmColRgbToXyz(rgbToXyzXO,rgbToXyzYO,rgbToXyzZO,lpmCol709R,lpmCol709G,lpmCol709B,lpmColD65); // AF3 xyzToRgbRO;AF3 xyzToRgbGO;AF3 xyzToRgbBO; // LpmMatInv3x3(xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXO,rgbToXyzYO,rgbToXyzZO); // // Generate the matrix. // LpmMatMul3x3(conR,conG,conB,xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXW,rgbToXyzYW,rgbToXyzZW); // - Adjust the conversion matrix for the multiply by 'maxNits/80.0'. // // After this the constants can be stored into a constant buffer. // AF1 conScale=maxNits*ARcpF1(80.0); // conR*=conScale;conG*=conScale;conB*=conScale; // - After CAS do the matrix multiply (passing the fetched constants into the shader). // outputR=dot(AF3(colorR,colorG,colorB),conR); // outputG=dot(AF3(colorR,colorG,colorB),conG); // outputB=dot(AF3(colorR,colorG,colorB),conB); // - Hopefully no developer is taking scRGB as input to CAS. // - If that was the case, the conversion matrix from sRGB to Rec.2020 can be built changing the above code. // - Swap the 'lpmCol709*' and 'lpmCol2020*' inputs to LpmColRgbToXyz(). // - Then scale by '80.0/maxNits' instead of 'maxNits/80.0'. // --------------------------- // - HDR10 output via native 10:10:10:2. // - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values). // - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits. // - Where 'maxNits' is typically not 10000. // - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs. // - Hopefully no developer needs to take PQ as input here, but if so can use A to convert PQ to linear: // // Where 'k0' is a constant of 'maxNits/10000.0'. // colorR=AFromPqF1(colorR*k0); // colorG=AFromPqF1(colorG*k0); // colorB=AFromPqF1(colorB*k0); // - After CAS convert from linear to PQ. // // Where 'k1' is a constant of '10000.0/maxNits'. // colorR=AToPqF1(colorR*k1); // colorG=AToPqF1(colorG*k1); // colorB=AToPqF1(colorB*k1); // --------------------------- // - Example of a bad idea for CAS input design. // - Have the pass before CAS store out in 10:10:10:2 UNORM with gamma 2.0. // - Store the output of CAS with sRGB to linear conversion, or with a gamma 2.2 conversion for FreeSync2 native. // - This will drop precision because the inputs had been quantized to 10-bit, // and the output is using a different tonal transform, // so inputs and outputs won't align for similar values. // - It might be "ok" for 8-bit/channel CAS output, but definitely not a good idea for 10-bit/channel output. //------------------------------------------------------------------------------------------------------------------------------ // ALGORITHM DESCRIPTION // ===================== // This describes the algorithm with CAS_BETTER_DIAGONALS defined. // The default is with CAS_BETTER_DIAGONALS not defined (which is faster). // Starting with no scaling. // CAS fetches a 3x3 neighborhood around the pixel 'e', // a b c // d(e)f // g h i // It then computes a 'soft' minimum and maximum, // a b c b // d e f * 0.5 + d e f * 0.5 // g h i h // The minimum and maximums give an idea of local contrast. // --- 1.0 ^ // | | <-- This minimum distance to the signal limit is divided by MAX to get a base sharpening amount 'A'. // --- MAX v // | // | // --- MIN ^ // | | <-- The MIN side is more distant in this example so it is not used, but for dark colors it would be used.
// | | // --- 0.0 v // The base sharpening amount 'A' from above is shaped with a sqrt(). // This 'A' ranges from 0 := no sharpening, to 1 := full sharpening. // Then 'A' is scaled by the sharpness knob while being transformed to a negative lobe (values from -1/5 to -1/8 for A=1). // The final filter kernel looks like this, // 0 A 0 // A 1 A <-- Center is always 1.0, followed by the negative lobe 'A' in a ring, and windowed into a circle with the 0.0s. // 0 A 0 // The local neighborhood is then multiplied by the kernel weights, summed and divided by the sum of the kernel weights. // The high quality path computes filter weights per channel. // The low quality path uses the green channel's filter weights to compute the 'A' factor for all channels. // --------------------- // The scaling path is a little more complex. // It starts by fetching the 4x4 neighborhood around the pixel centered between centers of pixels {f,g,j,k}, // a b c d // e(f g)h // i(j k)l // m n o p // The algorithm then computes the no-scaling result for {f,g,j,k}. // It then interpolates between those no-scaling results. // The interpolation is adaptive. // To hide bilinear interpolation and restore diagonals, it weights bilinear weights by 1/(const+contrast). // Where 'contrast' is the soft 'max-min'. // This makes edges thin out a little. // --------------------- // Without CAS_BETTER_DIAGONALS defined, the algorithm is a little faster. // Instead of using the 3x3 "box" with the 5-tap "circle" this uses just the "circle". // Drops to 5 texture fetches for no-scaling. // Drops to 12 texture fetches for scaling. // Drops a bunch of math. //------------------------------------------------------------------------------------------------------------------------------ // IDEAS FOR FUTURE // ================ // - Avoid V_CVT's by using denormals. // - Manually pack FP16 literals. 
//------------------------------------------------------------------------------------------------------------------------------ // CHANGE LOG // ========== // 20190610 - Misc documentation cleanup. // 20190609 - Removed lowQuality bool, improved scaling logic. // 20190530 - Unified CPU/GPU setup code, using new ffx_a.h, faster, define CAS_BETTER_DIAGONALS to get older slower one. // 20190529 - Missing a good way to re-interpret packed in HLSL, so disabling approximation optimizations for now. // 20190528 - Fixed so GPU CasSetup() generates half data all the time. // 20190527 - Implemented approximations for rcp() and sqrt(). // 20190524 - New algorithm, adjustable sharpness, scaling to 4x area. Fixed checker debug for no-scaling only. // 20190521 - Updated file naming. // 20190516 - Updated docs, fixed workaround, fixed no-scaling quality issue, removed gamma2 and generalized as CasInput*(). // 20190510 - Made the dispatch example safely round up for images that are not a multiple of 16x16. // 20190507 - Fixed typo bug in CAS_DEBUG_CHECKER, fixed sign typo in the docs. // 20190503 - Setup temporary workaround for compiler bug. // 20190502 - Added argument for 'gamma2' path so input transform in that case runs packed. // 20190426 - Improved documentation on format specific cases, etc. // 20190425 - Updated/corrected documentation. // 20190405 - Added CAS_PACKED_ONLY, misc bug fixes. // 20190404 - Updated for the new a.h header. //============================================================================================================================== // This is the practical limit for the algorithm's scaling ability (quality is limited by 3x3 taps). 
Example resolutions, // 1280x720 -> 1080p = 2.25x area // 1536x864 -> 1080p = 1.56x area // 1792x1008 -> 1440p = 2.04x area // 1920x1080 -> 1440p = 1.78x area // 1920x1080 -> 4K = 4.0x area // 2048x1152 -> 1440p = 1.56x area // 2560x1440 -> 4K = 2.25x area // 3072x1728 -> 4K = 1.56x area #define CAS_AREA_LIMIT 4.0 //------------------------------------------------------------------------------------------------------------------------------ // Pass in output and input resolution in pixels. // This returns true if CAS supports scaling in the given configuration. AP1 CasSupportScaling(AF1 outX,AF1 outY,AF1 inX,AF1 inY){return ((outX*outY)*ARcpF1(inX*inY))<=CAS_AREA_LIMIT;} //============================================================================================================================== // Call to setup required constant values (works on CPU or GPU). A_STATIC void CasSetup( outAU4 const0, outAU4 const1, AF1 sharpness, // 0 := default (lower ringing), 1 := maximum (higest ringing) AF1 inputSizeInPixelsX, AF1 inputSizeInPixelsY, AF1 outputSizeInPixelsX, AF1 outputSizeInPixelsY){ // Scaling terms. const0[0]=AU1_AF1(inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX)); const0[1]=AU1_AF1(inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY)); const0[2]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); const0[3]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); // Sharpness value. 
AF1 sharp=-ARcpF1(ALerpF1(8.0,5.0,ASatF1(sharpness))); varAF2(hSharp)=initAF2(sharp,0.0); const1[0]=AU1_AF1(sharp); const1[1]=AU1_AH2_AF2(hSharp); const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX)); const1[3]=AU1(0);} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //_____________________________________________________________/\_______________________________________________________________ //============================================================================================================================== // NON-PACKED VERSION //============================================================================================================================== #ifdef A_GPU #if defined(A_MSL) && !defined(CAS_TEXTURE) #define CAS_TEXTURE texture2d #endif #ifdef CAS_TEXTURE #define TEXCALL tex, #define TEXINPUT CAS_TEXTURE tex, #else #define TEXCALL #define TEXINPUT #endif #ifdef CAS_PACKED_ONLY // Avoid compiler error. A_STATIC AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);} // MSL Doesn't let you inout vector elements, so use a macro #define CasInput(r,g,b) #endif //------------------------------------------------------------------------------------------------------------------------------ A_STATIC void CasFilter( TEXINPUT outAF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy. outAF1 pixG, outAF1 pixB, AU2 ip, // Integer pixel position in output. AU4 const0, // Constants generated by CasSetup(). AU4 const1, AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize). //------------------------------------------------------------------------------------------------------------------------------ // Debug a checker pattern of on/off tiles for visual inspection. 
#ifdef CAS_DEBUG_CHECKER if((((ip.x^ip.y)>>8u)&1u)==0u){AF3 pix0=CasLoad(ASU2(ip)); pixR=pix0.r;pixG=pix0.g;pixB=pix0.b;CasInput(pixR,pixG,pixB);return;} #endif //------------------------------------------------------------------------------------------------------------------------------ // No scaling algorithm uses minimal 3x3 pixel neighborhood. if(noScaling){ // a b c // d e f // g h i ASU2 sp=ASU2(ip); A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1)); A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1)); A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1)); A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2(-1, 0)); A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp); A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp+ASU2( 1, 0)); A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2(-1, 1)); A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 0, 1)); A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2( 1, 1)); // Run optional input transform. CasInput(a.r,a.g,a.b); CasInput(b.r,b.g,b.b); CasInput(c.r,c.g,c.b); CasInput(d.r,d.g,d.b); CasInput(e.r,e.g,e.b); CasInput(f.r,f.g,f.b); CasInput(g.r,g.g,g.b); CasInput(h.r,h.g,h.b); CasInput(i.r,i.g,i.b); // Soft min and max. // a b c b // d e f * 0.5 + d e f * 0.5 // g h i h // These are 2.0x bigger (factored out the extra multiply). 
AF1 mnR=AMin3F1(AMin3F1(d.r,e.r,f.r),b.r,h.r); AF1 mnG=AMin3F1(AMin3F1(d.g,e.g,f.g),b.g,h.g); AF1 mnB=AMin3F1(AMin3F1(d.b,e.b,f.b),b.b,h.b); #ifdef CAS_BETTER_DIAGONALS AF1 mnR2=AMin3F1(AMin3F1(mnR,a.r,c.r),g.r,i.r); AF1 mnG2=AMin3F1(AMin3F1(mnG,a.g,c.g),g.g,i.g); AF1 mnB2=AMin3F1(AMin3F1(mnB,a.b,c.b),g.b,i.b); mnR=mnR+mnR2; mnG=mnG+mnG2; mnB=mnB+mnB2; #endif AF1 mxR=AMax3F1(AMax3F1(d.r,e.r,f.r),b.r,h.r); AF1 mxG=AMax3F1(AMax3F1(d.g,e.g,f.g),b.g,h.g); AF1 mxB=AMax3F1(AMax3F1(d.b,e.b,f.b),b.b,h.b); #ifdef CAS_BETTER_DIAGONALS AF1 mxR2=AMax3F1(AMax3F1(mxR,a.r,c.r),g.r,i.r); AF1 mxG2=AMax3F1(AMax3F1(mxG,a.g,c.g),g.g,i.g); AF1 mxB2=AMax3F1(AMax3F1(mxB,a.b,c.b),g.b,i.b); mxR=mxR+mxR2; mxG=mxG+mxG2; mxB=mxB+mxB2; #endif // Smooth minimum distance to signal limit divided by smooth max. #ifdef CAS_GO_SLOWER AF1 rcpMR=ARcpF1(mxR); AF1 rcpMG=ARcpF1(mxG); AF1 rcpMB=ARcpF1(mxB); #else AF1 rcpMR=APrxLoRcpF1(mxR); AF1 rcpMG=APrxLoRcpF1(mxG); AF1 rcpMB=APrxLoRcpF1(mxB); #endif #ifdef CAS_BETTER_DIAGONALS AF1 ampR=ASatF1(min(mnR,AF1_(2.0)-mxR)*rcpMR); AF1 ampG=ASatF1(min(mnG,AF1_(2.0)-mxG)*rcpMG); AF1 ampB=ASatF1(min(mnB,AF1_(2.0)-mxB)*rcpMB); #else AF1 ampR=ASatF1(min(mnR,AF1_(1.0)-mxR)*rcpMR); AF1 ampG=ASatF1(min(mnG,AF1_(1.0)-mxG)*rcpMG); AF1 ampB=ASatF1(min(mnB,AF1_(1.0)-mxB)*rcpMB); #endif // Shaping amount of sharpening. #ifdef CAS_GO_SLOWER ampR=sqrt(ampR); ampG=sqrt(ampG); ampB=sqrt(ampB); #else ampR=APrxLoSqrtF1(ampR); ampG=APrxLoSqrtF1(ampG); ampB=APrxLoSqrtF1(ampB); #endif // Filter shape. // 0 w 0 // w 1 w // 0 w 0 A_MAYBE_UNUSED AF1 peak=AF1_AU1(const1.x); A_MAYBE_UNUSED AF1 wR=ampR*peak; A_MAYBE_UNUSED AF1 wG=ampG*peak; A_MAYBE_UNUSED AF1 wB=ampB*peak; // Filter. #ifndef CAS_SLOW // Using green coef only, depending on dead code removal to strip out the extra overhead. 
#ifdef CAS_GO_SLOWER AF1 rcpWeight=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG); #else AF1 rcpWeight=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG); #endif pixR=ASatF1((b.r*wG+d.r*wG+f.r*wG+h.r*wG+e.r)*rcpWeight); pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeight); pixB=ASatF1((b.b*wG+d.b*wG+f.b*wG+h.b*wG+e.b)*rcpWeight); #else #ifdef CAS_GO_SLOWER AF1 rcpWeightR=ARcpF1(AF1_(1.0)+AF1_(4.0)*wR); AF1 rcpWeightG=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG); AF1 rcpWeightB=ARcpF1(AF1_(1.0)+AF1_(4.0)*wB); #else AF1 rcpWeightR=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wR); AF1 rcpWeightG=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG); AF1 rcpWeightB=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wB); #endif pixR=ASatF1((b.r*wR+d.r*wR+f.r*wR+h.r*wR+e.r)*rcpWeightR); pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeightG); pixB=ASatF1((b.b*wB+d.b*wB+f.b*wB+h.b*wB+e.b)*rcpWeightB); #endif return;} //------------------------------------------------------------------------------------------------------------------------------ // Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm. // a b c d // e f g h // i j k l // m n o p // Working these 4 results. // +-----+-----+ // | | | // | f..|..g | // | . | . | // +-----+-----+ // | . | . 
| // | j..|..k | // | | | // +-----+-----+ AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw); AF2 fp=floor(pp); pp-=fp; ASU2 sp=ASU2(fp); A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1)); A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1)); A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp+ASU2(-1, 0)); A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp); A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1)); A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2( 2,-1)); A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2( 1, 0)); A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 2, 0)); A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2(-1, 1)); A_MAYBE_UNUSED AF3 j=CasLoad(TEXCALL sp+ASU2( 0, 1)); A_MAYBE_UNUSED AF3 m=CasLoad(TEXCALL sp+ASU2(-1, 2)); A_MAYBE_UNUSED AF3 n=CasLoad(TEXCALL sp+ASU2( 0, 2)); A_MAYBE_UNUSED AF3 k=CasLoad(TEXCALL sp+ASU2( 1, 1)); A_MAYBE_UNUSED AF3 l=CasLoad(TEXCALL sp+ASU2( 2, 1)); A_MAYBE_UNUSED AF3 o=CasLoad(TEXCALL sp+ASU2( 1, 2)); A_MAYBE_UNUSED AF3 p=CasLoad(TEXCALL sp+ASU2( 2, 2)); // Run optional input transform. CasInput(a.r,a.g,a.b); CasInput(b.r,b.g,b.b); CasInput(c.r,c.g,c.b); CasInput(d.r,d.g,d.b); CasInput(e.r,e.g,e.b); CasInput(f.r,f.g,f.b); CasInput(g.r,g.g,g.b); CasInput(h.r,h.g,h.b); CasInput(i.r,i.g,i.b); CasInput(j.r,j.g,j.b); CasInput(k.r,k.g,k.b); CasInput(l.r,l.g,l.b); CasInput(m.r,m.g,m.b); CasInput(n.r,n.g,n.b); CasInput(o.r,o.g,o.b); CasInput(p.r,p.g,p.b); // Soft min and max. // These are 2.0x bigger (factored out the extra multiply). 
// a b c b // e f g * 0.5 + e f g * 0.5 [F] // i j k j AF1 mnfR=AMin3F1(AMin3F1(b.r,e.r,f.r),g.r,j.r); AF1 mnfG=AMin3F1(AMin3F1(b.g,e.g,f.g),g.g,j.g); AF1 mnfB=AMin3F1(AMin3F1(b.b,e.b,f.b),g.b,j.b); #ifdef CAS_BETTER_DIAGONALS AF1 mnfR2=AMin3F1(AMin3F1(mnfR,a.r,c.r),i.r,k.r); AF1 mnfG2=AMin3F1(AMin3F1(mnfG,a.g,c.g),i.g,k.g); AF1 mnfB2=AMin3F1(AMin3F1(mnfB,a.b,c.b),i.b,k.b); mnfR=mnfR+mnfR2; mnfG=mnfG+mnfG2; mnfB=mnfB+mnfB2; #endif AF1 mxfR=AMax3F1(AMax3F1(b.r,e.r,f.r),g.r,j.r); AF1 mxfG=AMax3F1(AMax3F1(b.g,e.g,f.g),g.g,j.g); AF1 mxfB=AMax3F1(AMax3F1(b.b,e.b,f.b),g.b,j.b); #ifdef CAS_BETTER_DIAGONALS AF1 mxfR2=AMax3F1(AMax3F1(mxfR,a.r,c.r),i.r,k.r); AF1 mxfG2=AMax3F1(AMax3F1(mxfG,a.g,c.g),i.g,k.g); AF1 mxfB2=AMax3F1(AMax3F1(mxfB,a.b,c.b),i.b,k.b); mxfR=mxfR+mxfR2; mxfG=mxfG+mxfG2; mxfB=mxfB+mxfB2; #endif // b c d c // f g h * 0.5 + f g h * 0.5 [G] // j k l k AF1 mngR=AMin3F1(AMin3F1(c.r,f.r,g.r),h.r,k.r); AF1 mngG=AMin3F1(AMin3F1(c.g,f.g,g.g),h.g,k.g); AF1 mngB=AMin3F1(AMin3F1(c.b,f.b,g.b),h.b,k.b); #ifdef CAS_BETTER_DIAGONALS AF1 mngR2=AMin3F1(AMin3F1(mngR,b.r,d.r),j.r,l.r); AF1 mngG2=AMin3F1(AMin3F1(mngG,b.g,d.g),j.g,l.g); AF1 mngB2=AMin3F1(AMin3F1(mngB,b.b,d.b),j.b,l.b); mngR=mngR+mngR2; mngG=mngG+mngG2; mngB=mngB+mngB2; #endif AF1 mxgR=AMax3F1(AMax3F1(c.r,f.r,g.r),h.r,k.r); AF1 mxgG=AMax3F1(AMax3F1(c.g,f.g,g.g),h.g,k.g); AF1 mxgB=AMax3F1(AMax3F1(c.b,f.b,g.b),h.b,k.b); #ifdef CAS_BETTER_DIAGONALS AF1 mxgR2=AMax3F1(AMax3F1(mxgR,b.r,d.r),j.r,l.r); AF1 mxgG2=AMax3F1(AMax3F1(mxgG,b.g,d.g),j.g,l.g); AF1 mxgB2=AMax3F1(AMax3F1(mxgB,b.b,d.b),j.b,l.b); mxgR=mxgR+mxgR2; mxgG=mxgG+mxgG2; mxgB=mxgB+mxgB2; #endif // e f g f // i j k * 0.5 + i j k * 0.5 [J] // m n o n AF1 mnjR=AMin3F1(AMin3F1(f.r,i.r,j.r),k.r,n.r); AF1 mnjG=AMin3F1(AMin3F1(f.g,i.g,j.g),k.g,n.g); AF1 mnjB=AMin3F1(AMin3F1(f.b,i.b,j.b),k.b,n.b); #ifdef CAS_BETTER_DIAGONALS AF1 mnjR2=AMin3F1(AMin3F1(mnjR,e.r,g.r),m.r,o.r); AF1 mnjG2=AMin3F1(AMin3F1(mnjG,e.g,g.g),m.g,o.g); AF1 
mnjB2=AMin3F1(AMin3F1(mnjB,e.b,g.b),m.b,o.b); mnjR=mnjR+mnjR2; mnjG=mnjG+mnjG2; mnjB=mnjB+mnjB2; #endif AF1 mxjR=AMax3F1(AMax3F1(f.r,i.r,j.r),k.r,n.r); AF1 mxjG=AMax3F1(AMax3F1(f.g,i.g,j.g),k.g,n.g); AF1 mxjB=AMax3F1(AMax3F1(f.b,i.b,j.b),k.b,n.b); #ifdef CAS_BETTER_DIAGONALS AF1 mxjR2=AMax3F1(AMax3F1(mxjR,e.r,g.r),m.r,o.r); AF1 mxjG2=AMax3F1(AMax3F1(mxjG,e.g,g.g),m.g,o.g); AF1 mxjB2=AMax3F1(AMax3F1(mxjB,e.b,g.b),m.b,o.b); mxjR=mxjR+mxjR2; mxjG=mxjG+mxjG2; mxjB=mxjB+mxjB2; #endif // f g h g // j k l * 0.5 + j k l * 0.5 [K] // n o p o AF1 mnkR=AMin3F1(AMin3F1(g.r,j.r,k.r),l.r,o.r); AF1 mnkG=AMin3F1(AMin3F1(g.g,j.g,k.g),l.g,o.g); AF1 mnkB=AMin3F1(AMin3F1(g.b,j.b,k.b),l.b,o.b); #ifdef CAS_BETTER_DIAGONALS AF1 mnkR2=AMin3F1(AMin3F1(mnkR,f.r,h.r),n.r,p.r); AF1 mnkG2=AMin3F1(AMin3F1(mnkG,f.g,h.g),n.g,p.g); AF1 mnkB2=AMin3F1(AMin3F1(mnkB,f.b,h.b),n.b,p.b); mnkR=mnkR+mnkR2; mnkG=mnkG+mnkG2; mnkB=mnkB+mnkB2; #endif AF1 mxkR=AMax3F1(AMax3F1(g.r,j.r,k.r),l.r,o.r); AF1 mxkG=AMax3F1(AMax3F1(g.g,j.g,k.g),l.g,o.g); AF1 mxkB=AMax3F1(AMax3F1(g.b,j.b,k.b),l.b,o.b); #ifdef CAS_BETTER_DIAGONALS AF1 mxkR2=AMax3F1(AMax3F1(mxkR,f.r,h.r),n.r,p.r); AF1 mxkG2=AMax3F1(AMax3F1(mxkG,f.g,h.g),n.g,p.g); AF1 mxkB2=AMax3F1(AMax3F1(mxkB,f.b,h.b),n.b,p.b); mxkR=mxkR+mxkR2; mxkG=mxkG+mxkG2; mxkB=mxkB+mxkB2; #endif // Smooth minimum distance to signal limit divided by smooth max. 
#ifdef CAS_GO_SLOWER AF1 rcpMfR=ARcpF1(mxfR); AF1 rcpMfG=ARcpF1(mxfG); AF1 rcpMfB=ARcpF1(mxfB); AF1 rcpMgR=ARcpF1(mxgR); AF1 rcpMgG=ARcpF1(mxgG); AF1 rcpMgB=ARcpF1(mxgB); AF1 rcpMjR=ARcpF1(mxjR); AF1 rcpMjG=ARcpF1(mxjG); AF1 rcpMjB=ARcpF1(mxjB); AF1 rcpMkR=ARcpF1(mxkR); AF1 rcpMkG=ARcpF1(mxkG); AF1 rcpMkB=ARcpF1(mxkB); #else AF1 rcpMfR=APrxLoRcpF1(mxfR); AF1 rcpMfG=APrxLoRcpF1(mxfG); AF1 rcpMfB=APrxLoRcpF1(mxfB); AF1 rcpMgR=APrxLoRcpF1(mxgR); AF1 rcpMgG=APrxLoRcpF1(mxgG); AF1 rcpMgB=APrxLoRcpF1(mxgB); AF1 rcpMjR=APrxLoRcpF1(mxjR); AF1 rcpMjG=APrxLoRcpF1(mxjG); AF1 rcpMjB=APrxLoRcpF1(mxjB); AF1 rcpMkR=APrxLoRcpF1(mxkR); AF1 rcpMkG=APrxLoRcpF1(mxkG); AF1 rcpMkB=APrxLoRcpF1(mxkB); #endif #ifdef CAS_BETTER_DIAGONALS AF1 ampfR=ASatF1(min(mnfR,AF1_(2.0)-mxfR)*rcpMfR); AF1 ampfG=ASatF1(min(mnfG,AF1_(2.0)-mxfG)*rcpMfG); AF1 ampfB=ASatF1(min(mnfB,AF1_(2.0)-mxfB)*rcpMfB); AF1 ampgR=ASatF1(min(mngR,AF1_(2.0)-mxgR)*rcpMgR); AF1 ampgG=ASatF1(min(mngG,AF1_(2.0)-mxgG)*rcpMgG); AF1 ampgB=ASatF1(min(mngB,AF1_(2.0)-mxgB)*rcpMgB); AF1 ampjR=ASatF1(min(mnjR,AF1_(2.0)-mxjR)*rcpMjR); AF1 ampjG=ASatF1(min(mnjG,AF1_(2.0)-mxjG)*rcpMjG); AF1 ampjB=ASatF1(min(mnjB,AF1_(2.0)-mxjB)*rcpMjB); AF1 ampkR=ASatF1(min(mnkR,AF1_(2.0)-mxkR)*rcpMkR); AF1 ampkG=ASatF1(min(mnkG,AF1_(2.0)-mxkG)*rcpMkG); AF1 ampkB=ASatF1(min(mnkB,AF1_(2.0)-mxkB)*rcpMkB); #else AF1 ampfR=ASatF1(min(mnfR,AF1_(1.0)-mxfR)*rcpMfR); AF1 ampfG=ASatF1(min(mnfG,AF1_(1.0)-mxfG)*rcpMfG); AF1 ampfB=ASatF1(min(mnfB,AF1_(1.0)-mxfB)*rcpMfB); AF1 ampgR=ASatF1(min(mngR,AF1_(1.0)-mxgR)*rcpMgR); AF1 ampgG=ASatF1(min(mngG,AF1_(1.0)-mxgG)*rcpMgG); AF1 ampgB=ASatF1(min(mngB,AF1_(1.0)-mxgB)*rcpMgB); AF1 ampjR=ASatF1(min(mnjR,AF1_(1.0)-mxjR)*rcpMjR); AF1 ampjG=ASatF1(min(mnjG,AF1_(1.0)-mxjG)*rcpMjG); AF1 ampjB=ASatF1(min(mnjB,AF1_(1.0)-mxjB)*rcpMjB); AF1 ampkR=ASatF1(min(mnkR,AF1_(1.0)-mxkR)*rcpMkR); AF1 ampkG=ASatF1(min(mnkG,AF1_(1.0)-mxkG)*rcpMkG); AF1 ampkB=ASatF1(min(mnkB,AF1_(1.0)-mxkB)*rcpMkB); #endif // Shaping amount of sharpening. 
#ifdef CAS_GO_SLOWER ampfR=sqrt(ampfR); ampfG=sqrt(ampfG); ampfB=sqrt(ampfB); ampgR=sqrt(ampgR); ampgG=sqrt(ampgG); ampgB=sqrt(ampgB); ampjR=sqrt(ampjR); ampjG=sqrt(ampjG); ampjB=sqrt(ampjB); ampkR=sqrt(ampkR); ampkG=sqrt(ampkG); ampkB=sqrt(ampkB); #else ampfR=APrxLoSqrtF1(ampfR); ampfG=APrxLoSqrtF1(ampfG); ampfB=APrxLoSqrtF1(ampfB); ampgR=APrxLoSqrtF1(ampgR); ampgG=APrxLoSqrtF1(ampgG); ampgB=APrxLoSqrtF1(ampgB); ampjR=APrxLoSqrtF1(ampjR); ampjG=APrxLoSqrtF1(ampjG); ampjB=APrxLoSqrtF1(ampjB); ampkR=APrxLoSqrtF1(ampkR); ampkG=APrxLoSqrtF1(ampkG); ampkB=APrxLoSqrtF1(ampkB); #endif // Filter shape. // 0 w 0 // w 1 w // 0 w 0 AF1 peak=AF1_AU1(const1.x); AF1 wfR=ampfR*peak; AF1 wfG=ampfG*peak; AF1 wfB=ampfB*peak; AF1 wgR=ampgR*peak; AF1 wgG=ampgG*peak; AF1 wgB=ampgB*peak; AF1 wjR=ampjR*peak; AF1 wjG=ampjG*peak; AF1 wjB=ampjB*peak; AF1 wkR=ampkR*peak; AF1 wkG=ampkG*peak; AF1 wkB=ampkB*peak; // Blend between 4 results. // s t // u v AF1 s=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); AF1 t= pp.x *(AF1_(1.0)-pp.y); AF1 u=(AF1_(1.0)-pp.x)* pp.y ; AF1 v= pp.x * pp.y ; // Thin edges to hide bilinear interpolation (helps diagonals). AF1 thinB=1.0/32.0; #ifdef CAS_GO_SLOWER s*=ARcpF1(thinB+(mxfG-mnfG)); t*=ARcpF1(thinB+(mxgG-mngG)); u*=ARcpF1(thinB+(mxjG-mnjG)); v*=ARcpF1(thinB+(mxkG-mnkG)); #else s*=APrxLoRcpF1(thinB+(mxfG-mnfG)); t*=APrxLoRcpF1(thinB+(mxgG-mngG)); u*=APrxLoRcpF1(thinB+(mxjG-mnjG)); v*=APrxLoRcpF1(thinB+(mxkG-mnkG)); #endif // Final weighting. 
// b c // e f g h // i j k l // n o // _____ _____ _____ _____ // fs gt // // _____ _____ _____ _____ // fs s gt fs t gt // ju kv // _____ _____ _____ _____ // fs gt // ju u kv ju v kv // _____ _____ _____ _____ // // ju kv A_MAYBE_UNUSED AF1 qbeR=wfR*s; A_MAYBE_UNUSED AF1 qbeG=wfG*s; A_MAYBE_UNUSED AF1 qbeB=wfB*s; A_MAYBE_UNUSED AF1 qchR=wgR*t; A_MAYBE_UNUSED AF1 qchG=wgG*t; A_MAYBE_UNUSED AF1 qchB=wgB*t; A_MAYBE_UNUSED AF1 qfR=wgR*t+wjR*u+s; A_MAYBE_UNUSED AF1 qfG=wgG*t+wjG*u+s; A_MAYBE_UNUSED AF1 qfB=wgB*t+wjB*u+s; A_MAYBE_UNUSED AF1 qgR=wfR*s+wkR*v+t; A_MAYBE_UNUSED AF1 qgG=wfG*s+wkG*v+t; A_MAYBE_UNUSED AF1 qgB=wfB*s+wkB*v+t; A_MAYBE_UNUSED AF1 qjR=wfR*s+wkR*v+u; A_MAYBE_UNUSED AF1 qjG=wfG*s+wkG*v+u; A_MAYBE_UNUSED AF1 qjB=wfB*s+wkB*v+u; A_MAYBE_UNUSED AF1 qkR=wgR*t+wjR*u+v; A_MAYBE_UNUSED AF1 qkG=wgG*t+wjG*u+v; A_MAYBE_UNUSED AF1 qkB=wgB*t+wjB*u+v; A_MAYBE_UNUSED AF1 qinR=wjR*u; A_MAYBE_UNUSED AF1 qinG=wjG*u; A_MAYBE_UNUSED AF1 qinB=wjB*u; A_MAYBE_UNUSED AF1 qloR=wkR*v; A_MAYBE_UNUSED AF1 qloG=wkG*v; A_MAYBE_UNUSED AF1 qloB=wkB*v; // Filter. #ifndef CAS_SLOW // Using green coef only, depending on dead code removal to strip out the extra overhead. 
#ifdef CAS_GO_SLOWER AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG); #else AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG); #endif pixR=ASatF1((b.r*qbeG+e.r*qbeG+c.r*qchG+h.r*qchG+i.r*qinG+n.r*qinG+l.r*qloG+o.r*qloG+f.r*qfG+g.r*qgG+j.r*qjG+k.r*qkG)*rcpWG); pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG); pixB=ASatF1((b.b*qbeG+e.b*qbeG+c.b*qchG+h.b*qchG+i.b*qinG+n.b*qinG+l.b*qloG+o.b*qloG+f.b*qfG+g.b*qgG+j.b*qjG+k.b*qkG)*rcpWG); #else #ifdef CAS_GO_SLOWER AF1 rcpWR=ARcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR); AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG); AF1 rcpWB=ARcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB); #else AF1 rcpWR=APrxMedRcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR); AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG); AF1 rcpWB=APrxMedRcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB); #endif pixR=ASatF1((b.r*qbeR+e.r*qbeR+c.r*qchR+h.r*qchR+i.r*qinR+n.r*qinR+l.r*qloR+o.r*qloR+f.r*qfR+g.r*qgR+j.r*qjR+k.r*qkR)*rcpWR); pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG); pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB); #endif } #undef TEXINPUT #undef TEXCALL #endif //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                       PACKED VERSION
//==============================================================================================================================
#if defined(A_GPU) && defined(A_HALF)
  // On MSL, default the texture type passed through to CasLoadH() unless the application already chose one.
  // NOTE(review): Metal's texture2d is a class template; confirm whether a template argument was lost from this define.
  #if defined(A_MSL) && !defined(CAS_TEXTUREH)
    #define CAS_TEXTUREH texture2d
  #endif
  // TEXINPUT declares the optional leading texture parameter, TEXCALL forwards it to CasLoadH().
  // Both expand to nothing when CAS_TEXTUREH is not defined.
  #ifdef CAS_TEXTUREH
    #define TEXCALL tex,
    #define TEXINPUT CAS_TEXTUREH tex,
  #else
    #define TEXCALL
    #define TEXINPUT
  #endif
  // Missing a way to do packed re-interpretation, so must disable approximation optimizations.
  #ifdef A_HLSL
    #ifndef CAS_GO_SLOWER
      #define CAS_GO_SLOWER 1
    #endif
  #endif
//==============================================================================================================================
  // Can be used to convert from packed SOA to AOS for store.
  //  pix0.rgb receives the .x components of pixR/pixG/pixB.
  //  pix1.rgb receives the .y components of pixR/pixG/pixB.
  // Alpha is only written (as 0.0) on HLSL.
  void CasDepack(outAH4 pix0,outAH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
    #ifdef A_HLSL
      // Invoke a slower path for DX only, since it won't allow uninitialized values.
      pix0.a=pix1.a=0.0;
    #endif
    pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);
    pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
//==============================================================================================================================
  // Packed 16-bit CAS filter: produces one output pixel for each of two 8x8 tiles per call.
  //  - noScaling=true  : sharpen only, 3x3 neighborhood around 'ip' and around 'ip+(8,0)'.
  //  - noScaling=false : sharpen and resample, 4x4 neighborhood around the source position of each tile.
  // Neighborhood pixels are fetched with CasLoadH() and passed through CasInputH() before filtering.
  void CasFilterH(
  TEXINPUT
  // Output values are for 2 8x8 tiles in a 16x8 region.
  //  pixR/pixG/pixB .x = 8x8 tile at 'ip'        (first tile, left if x grows to the right)
  //  pixR/pixG/pixB .y = 8x8 tile at 'ip+(8,0)'  (second tile, right if x grows to the right)
  // This enables later processing to easily be packed as well.
  outAH2 pixR,
  outAH2 pixG,
  outAH2 pixB,
  AU2 ip, // Integer pixel position in output.
  AU4 const0, // Constants generated by CasSetup().
  AU4 const1,
  AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
    // Debug a checker pattern of on/off tiles for visual inspection.
    // TEXCALL is forwarded here like in every other CasLoadH() call of this function,
    // so the debug path uses the same loader signature when CAS_TEXTUREH is defined.
    #ifdef CAS_DEBUG_CHECKER
      if((((ip.x^ip.y)>>8u)&1u)==0u){
        AH3 pix0=CasLoadH(TEXCALL ASW2(ip));
        AH3 pix1=CasLoadH(TEXCALL ASW2(ip)+ASW2(8,0));
        pixR=AH2(pix0.r,pix1.r);
        pixG=AH2(pix0.g,pix1.g);
        pixB=AH2(pix0.b,pix1.b);
        CasInputH(pixR,pixG,pixB);
        return;}
    #endif
//------------------------------------------------------------------------------------------------------------------------------
    // No scaling algorithm uses minimal 3x3 pixel neighborhood.
    //  a b c
    //  d e f
    //  g h i
    if(noScaling){
      // Tile 0 neighborhood, centered on 'ip'.
      ASW2 sp0=ASW2(ip);
      AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
      AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
      AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
      AH3 d0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
      AH3 e0=CasLoadH(TEXCALL sp0);
      AH3 f0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
      AH3 g0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
      AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
      AH3 i0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
      // Tile 1 neighborhood, 8 pixels further along x.
      ASW2 sp1=sp0+ASW2(8,0);
      AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
      AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
      AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
      AH3 d1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
      AH3 e1=CasLoadH(TEXCALL sp1);
      AH3 f1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
      AH3 g1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
      AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
      AH3 i1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
      // AOS to SOA conversion (.x = tile 0, .y = tile 1).
      AH2 aR=AH2(a0.r,a1.r);
      AH2 aG=AH2(a0.g,a1.g);
      AH2 aB=AH2(a0.b,a1.b);
      AH2 bR=AH2(b0.r,b1.r);
      AH2 bG=AH2(b0.g,b1.g);
      AH2 bB=AH2(b0.b,b1.b);
      AH2 cR=AH2(c0.r,c1.r);
      AH2 cG=AH2(c0.g,c1.g);
      AH2 cB=AH2(c0.b,c1.b);
      AH2 dR=AH2(d0.r,d1.r);
      AH2 dG=AH2(d0.g,d1.g);
      AH2 dB=AH2(d0.b,d1.b);
      AH2 eR=AH2(e0.r,e1.r);
      AH2 eG=AH2(e0.g,e1.g);
      AH2 eB=AH2(e0.b,e1.b);
      AH2 fR=AH2(f0.r,f1.r);
      AH2 fG=AH2(f0.g,f1.g);
      AH2 fB=AH2(f0.b,f1.b);
      AH2 gR=AH2(g0.r,g1.r);
      AH2 gG=AH2(g0.g,g1.g);
      AH2 gB=AH2(g0.b,g1.b);
      AH2 hR=AH2(h0.r,h1.r);
      AH2 hG=AH2(h0.g,h1.g);
      AH2 hB=AH2(h0.b,h1.b);
      AH2 iR=AH2(i0.r,i1.r);
      AH2 iG=AH2(i0.g,i1.g);
      AH2 iB=AH2(i0.b,i1.b);
      // Run optional input transform.
      CasInputH(aR,aG,aB);
      CasInputH(bR,bG,bB);
      CasInputH(cR,cG,cB);
      CasInputH(dR,dG,dB);
      CasInputH(eR,eG,eB);
      CasInputH(fR,fG,fB);
      CasInputH(gR,gG,gB);
      CasInputH(hR,hG,hB);
      CasInputH(iR,iG,iB);
      // Soft min and max.
      // Plus-shaped neighborhood (b,d,e,f,h) first; with CAS_BETTER_DIAGONALS the full 3x3 result is added on top,
      // which makes the values 2.0x bigger (compensated by the 2.0 limit used for the amplitude below).
      AH2 mnR=min(min(fR,hR),min(min(bR,dR),eR));
      AH2 mnG=min(min(fG,hG),min(min(bG,dG),eG));
      AH2 mnB=min(min(fB,hB),min(min(bB,dB),eB));
      #ifdef CAS_BETTER_DIAGONALS
        AH2 mnR2=min(min(gR,iR),min(min(aR,cR),mnR));
        AH2 mnG2=min(min(gG,iG),min(min(aG,cG),mnG));
        AH2 mnB2=min(min(gB,iB),min(min(aB,cB),mnB));
        mnR=mnR+mnR2;
        mnG=mnG+mnG2;
        mnB=mnB+mnB2;
      #endif
      AH2 mxR=max(max(fR,hR),max(max(bR,dR),eR));
      AH2 mxG=max(max(fG,hG),max(max(bG,dG),eG));
      AH2 mxB=max(max(fB,hB),max(max(bB,dB),eB));
      #ifdef CAS_BETTER_DIAGONALS
        AH2 mxR2=max(max(gR,iR),max(max(aR,cR),mxR));
        AH2 mxG2=max(max(gG,iG),max(max(aG,cG),mxG));
        AH2 mxB2=max(max(gB,iB),max(max(aB,cB),mxB));
        mxR=mxR+mxR2;
        mxG=mxG+mxG2;
        mxB=mxB+mxB2;
      #endif
      // Smooth minimum distance to signal limit divided by smooth max.
      #ifdef CAS_GO_SLOWER
        AH2 rcpMR=ARcpH2(mxR);
        AH2 rcpMG=ARcpH2(mxG);
        AH2 rcpMB=ARcpH2(mxB);
      #else
        AH2 rcpMR=APrxLoRcpH2(mxR);
        AH2 rcpMG=APrxLoRcpH2(mxG);
        AH2 rcpMB=APrxLoRcpH2(mxB);
      #endif
      #ifdef CAS_BETTER_DIAGONALS
        AH2 ampR=ASatH2(min(mnR,AH2_(2.0)-mxR)*rcpMR);
        AH2 ampG=ASatH2(min(mnG,AH2_(2.0)-mxG)*rcpMG);
        AH2 ampB=ASatH2(min(mnB,AH2_(2.0)-mxB)*rcpMB);
      #else
        AH2 ampR=ASatH2(min(mnR,AH2_(1.0)-mxR)*rcpMR);
        AH2 ampG=ASatH2(min(mnG,AH2_(1.0)-mxG)*rcpMG);
        AH2 ampB=ASatH2(min(mnB,AH2_(1.0)-mxB)*rcpMB);
      #endif
      // Shaping amount of sharpening.
      #ifdef CAS_GO_SLOWER
        ampR=sqrt(ampR);
        ampG=sqrt(ampG);
        ampB=sqrt(ampB);
      #else
        ampR=APrxLoSqrtH2(ampR);
        ampG=APrxLoSqrtH2(ampG);
        ampB=APrxLoSqrtH2(ampB);
      #endif
      // Filter shape.
      //  0 w 0
      //  w 1 w
      //  0 w 0
      // 'peak' is read from the .x half of const1.y reinterpreted as a packed half pair.
      A_MAYBE_UNUSED AH1 peak=AH2_AU1(const1.y).x;
      A_MAYBE_UNUSED AH2 wR=ampR*AH2_(peak);
      A_MAYBE_UNUSED AH2 wG=ampG*AH2_(peak);
      A_MAYBE_UNUSED AH2 wB=ampB*AH2_(peak);
      // Filter.
      #ifndef CAS_SLOW
        // Default path: the green weight drives all three channels.
        #ifdef CAS_GO_SLOWER
          AH2 rcpWeight=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
        #else
          AH2 rcpWeight=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
        #endif
        pixR=ASatH2((bR*wG+dR*wG+fR*wG+hR*wG+eR)*rcpWeight);
        pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeight);
        pixB=ASatH2((bB*wG+dB*wG+fB*wG+hB*wG+eB)*rcpWeight);
      #else
        // CAS_SLOW path: each channel uses its own weight.
        #ifdef CAS_GO_SLOWER
          AH2 rcpWeightR=ARcpH2(AH2_(1.0)+AH2_(4.0)*wR);
          AH2 rcpWeightG=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
          AH2 rcpWeightB=ARcpH2(AH2_(1.0)+AH2_(4.0)*wB);
        #else
          AH2 rcpWeightR=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wR);
          AH2 rcpWeightG=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
          AH2 rcpWeightB=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wB);
        #endif
        pixR=ASatH2((bR*wR+dR*wR+fR*wR+hR*wR+eR)*rcpWeightR);
        pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeightG);
        pixB=ASatH2((bB*wB+dB*wB+fB*wB+hB*wB+eB)*rcpWeightB);
      #endif
      return;}
//------------------------------------------------------------------------------------------------------------------------------
    // Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
    //  a b c d
    //  e f g h
    //  i j k l
    //  m n o p
    // The 4 results are the ones centered on f, g, j and k.
    // Source position = ip * const0.xy + const0.zw, with const0 reinterpreted as floats.
    AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
    // Tile 0.
    // Fractional position is needed in high precision here.
    AF2 fp0=floor(pp);
    AH2 ppX;
    ppX.x=AH1(pp.x-fp0.x);
    AH1 ppY=AH1(pp.y-fp0.y);
    ASW2 sp0=ASW2(fp0);
    AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
    AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
    AH3 e0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
    AH3 f0=CasLoadH(TEXCALL sp0);
    AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
    AH3 d0=CasLoadH(TEXCALL sp0+ASW2( 2,-1));
    AH3 g0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
    AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 2, 0));
    AH3 i0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
    AH3 j0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
    AH3 m0=CasLoadH(TEXCALL sp0+ASW2(-1, 2));
    AH3 n0=CasLoadH(TEXCALL sp0+ASW2( 0, 2));
    AH3 k0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
    AH3 l0=CasLoadH(TEXCALL sp0+ASW2( 2, 1));
    AH3 o0=CasLoadH(TEXCALL sp0+ASW2( 1, 2));
    AH3 p0=CasLoadH(TEXCALL sp0+ASW2( 2, 2));
    // Tile 1 (offset only in x).
    // const1.z (reinterpreted as float) is added to the tile 0 source x position; the y position is shared.
    AF1 pp1=pp.x+AF1_AU1(const1.z);
    AF1 fp1=floor(pp1);
    ppX.y=AH1(pp1-fp1);
    ASW2 sp1=ASW2(fp1,sp0.y);
    AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
    AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
    AH3 e1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
    AH3 f1=CasLoadH(TEXCALL sp1);
    AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
    AH3 d1=CasLoadH(TEXCALL sp1+ASW2( 2,-1));
    AH3 g1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
    AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 2, 0));
    AH3 i1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
    AH3 j1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
    AH3 m1=CasLoadH(TEXCALL sp1+ASW2(-1, 2));
    AH3 n1=CasLoadH(TEXCALL sp1+ASW2( 0, 2));
    AH3 k1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
    AH3 l1=CasLoadH(TEXCALL sp1+ASW2( 2, 1));
    AH3 o1=CasLoadH(TEXCALL sp1+ASW2( 1, 2));
    AH3 p1=CasLoadH(TEXCALL sp1+ASW2( 2, 2));
    // AOS to SOA conversion (.x = tile 0, .y = tile 1).
    AH2 aR=AH2(a0.r,a1.r);
    AH2 aG=AH2(a0.g,a1.g);
    AH2 aB=AH2(a0.b,a1.b);
    AH2 bR=AH2(b0.r,b1.r);
    AH2 bG=AH2(b0.g,b1.g);
    AH2 bB=AH2(b0.b,b1.b);
    AH2 cR=AH2(c0.r,c1.r);
    AH2 cG=AH2(c0.g,c1.g);
    AH2 cB=AH2(c0.b,c1.b);
    AH2 dR=AH2(d0.r,d1.r);
    AH2 dG=AH2(d0.g,d1.g);
    AH2 dB=AH2(d0.b,d1.b);
    AH2 eR=AH2(e0.r,e1.r);
    AH2 eG=AH2(e0.g,e1.g);
    AH2 eB=AH2(e0.b,e1.b);
    AH2 fR=AH2(f0.r,f1.r);
    AH2 fG=AH2(f0.g,f1.g);
    AH2 fB=AH2(f0.b,f1.b);
    AH2 gR=AH2(g0.r,g1.r);
    AH2 gG=AH2(g0.g,g1.g);
    AH2 gB=AH2(g0.b,g1.b);
    AH2 hR=AH2(h0.r,h1.r);
    AH2 hG=AH2(h0.g,h1.g);
    AH2 hB=AH2(h0.b,h1.b);
    AH2 iR=AH2(i0.r,i1.r);
    AH2 iG=AH2(i0.g,i1.g);
    AH2 iB=AH2(i0.b,i1.b);
    AH2 jR=AH2(j0.r,j1.r);
    AH2 jG=AH2(j0.g,j1.g);
    AH2 jB=AH2(j0.b,j1.b);
    AH2 kR=AH2(k0.r,k1.r);
    AH2 kG=AH2(k0.g,k1.g);
    AH2 kB=AH2(k0.b,k1.b);
    AH2 lR=AH2(l0.r,l1.r);
    AH2 lG=AH2(l0.g,l1.g);
    AH2 lB=AH2(l0.b,l1.b);
    AH2 mR=AH2(m0.r,m1.r);
    AH2 mG=AH2(m0.g,m1.g);
    AH2 mB=AH2(m0.b,m1.b);
    AH2 nR=AH2(n0.r,n1.r);
    AH2 nG=AH2(n0.g,n1.g);
    AH2 nB=AH2(n0.b,n1.b);
    AH2 oR=AH2(o0.r,o1.r);
    AH2 oG=AH2(o0.g,o1.g);
    AH2 oB=AH2(o0.b,o1.b);
    AH2 pR=AH2(p0.r,p1.r);
    AH2 pG=AH2(p0.g,p1.g);
    AH2 pB=AH2(p0.b,p1.b);
    // Run optional input transform.
    CasInputH(aR,aG,aB);
    CasInputH(bR,bG,bB);
    CasInputH(cR,cG,cB);
    CasInputH(dR,dG,dB);
    CasInputH(eR,eG,eB);
    CasInputH(fR,fG,fB);
    CasInputH(gR,gG,gB);
    CasInputH(hR,hG,hB);
    CasInputH(iR,iG,iB);
    CasInputH(jR,jG,jB);
    CasInputH(kR,kG,kB);
    CasInputH(lR,lG,lB);
    CasInputH(mR,mG,mB);
    CasInputH(nR,nG,nB);
    CasInputH(oR,oG,oB);
    CasInputH(pR,pG,pB);
    // Soft min and max.
    // These are 2.0x bigger (factored out the extra multiply).
    //  a b c             b
    //  e f g * 0.5  +  e f g * 0.5  [F]
    //  i j k             j
    AH2 mnfR=AMin3H2(AMin3H2(bR,eR,fR),gR,jR);
    AH2 mnfG=AMin3H2(AMin3H2(bG,eG,fG),gG,jG);
    AH2 mnfB=AMin3H2(AMin3H2(bB,eB,fB),gB,jB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mnfR2=AMin3H2(AMin3H2(mnfR,aR,cR),iR,kR);
      AH2 mnfG2=AMin3H2(AMin3H2(mnfG,aG,cG),iG,kG);
      AH2 mnfB2=AMin3H2(AMin3H2(mnfB,aB,cB),iB,kB);
      mnfR=mnfR+mnfR2;
      mnfG=mnfG+mnfG2;
      mnfB=mnfB+mnfB2;
    #endif
    AH2 mxfR=AMax3H2(AMax3H2(bR,eR,fR),gR,jR);
    AH2 mxfG=AMax3H2(AMax3H2(bG,eG,fG),gG,jG);
    AH2 mxfB=AMax3H2(AMax3H2(bB,eB,fB),gB,jB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mxfR2=AMax3H2(AMax3H2(mxfR,aR,cR),iR,kR);
      AH2 mxfG2=AMax3H2(AMax3H2(mxfG,aG,cG),iG,kG);
      AH2 mxfB2=AMax3H2(AMax3H2(mxfB,aB,cB),iB,kB);
      mxfR=mxfR+mxfR2;
      mxfG=mxfG+mxfG2;
      mxfB=mxfB+mxfB2;
    #endif
    //  b c d             c
    //  f g h * 0.5  +  f g h * 0.5  [G]
    //  j k l             k
    AH2 mngR=AMin3H2(AMin3H2(cR,fR,gR),hR,kR);
    AH2 mngG=AMin3H2(AMin3H2(cG,fG,gG),hG,kG);
    AH2 mngB=AMin3H2(AMin3H2(cB,fB,gB),hB,kB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mngR2=AMin3H2(AMin3H2(mngR,bR,dR),jR,lR);
      AH2 mngG2=AMin3H2(AMin3H2(mngG,bG,dG),jG,lG);
      AH2 mngB2=AMin3H2(AMin3H2(mngB,bB,dB),jB,lB);
      mngR=mngR+mngR2;
      mngG=mngG+mngG2;
      mngB=mngB+mngB2;
    #endif
    AH2 mxgR=AMax3H2(AMax3H2(cR,fR,gR),hR,kR);
    AH2 mxgG=AMax3H2(AMax3H2(cG,fG,gG),hG,kG);
    AH2 mxgB=AMax3H2(AMax3H2(cB,fB,gB),hB,kB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mxgR2=AMax3H2(AMax3H2(mxgR,bR,dR),jR,lR);
      AH2 mxgG2=AMax3H2(AMax3H2(mxgG,bG,dG),jG,lG);
      AH2 mxgB2=AMax3H2(AMax3H2(mxgB,bB,dB),jB,lB);
      mxgR=mxgR+mxgR2;
      mxgG=mxgG+mxgG2;
      mxgB=mxgB+mxgB2;
    #endif
    //  e f g             f
    //  i j k * 0.5  +  i j k * 0.5  [J]
    //  m n o             n
    AH2 mnjR=AMin3H2(AMin3H2(fR,iR,jR),kR,nR);
    AH2 mnjG=AMin3H2(AMin3H2(fG,iG,jG),kG,nG);
    AH2 mnjB=AMin3H2(AMin3H2(fB,iB,jB),kB,nB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mnjR2=AMin3H2(AMin3H2(mnjR,eR,gR),mR,oR);
      AH2 mnjG2=AMin3H2(AMin3H2(mnjG,eG,gG),mG,oG);
      AH2 mnjB2=AMin3H2(AMin3H2(mnjB,eB,gB),mB,oB);
      mnjR=mnjR+mnjR2;
      mnjG=mnjG+mnjG2;
      mnjB=mnjB+mnjB2;
    #endif
    AH2 mxjR=AMax3H2(AMax3H2(fR,iR,jR),kR,nR);
    AH2 mxjG=AMax3H2(AMax3H2(fG,iG,jG),kG,nG);
    AH2 mxjB=AMax3H2(AMax3H2(fB,iB,jB),kB,nB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mxjR2=AMax3H2(AMax3H2(mxjR,eR,gR),mR,oR);
      AH2 mxjG2=AMax3H2(AMax3H2(mxjG,eG,gG),mG,oG);
      AH2 mxjB2=AMax3H2(AMax3H2(mxjB,eB,gB),mB,oB);
      mxjR=mxjR+mxjR2;
      mxjG=mxjG+mxjG2;
      mxjB=mxjB+mxjB2;
    #endif
    //  f g h             g
    //  j k l * 0.5  +  j k l * 0.5  [K]
    //  n o p             o
    AH2 mnkR=AMin3H2(AMin3H2(gR,jR,kR),lR,oR);
    AH2 mnkG=AMin3H2(AMin3H2(gG,jG,kG),lG,oG);
    AH2 mnkB=AMin3H2(AMin3H2(gB,jB,kB),lB,oB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mnkR2=AMin3H2(AMin3H2(mnkR,fR,hR),nR,pR);
      AH2 mnkG2=AMin3H2(AMin3H2(mnkG,fG,hG),nG,pG);
      AH2 mnkB2=AMin3H2(AMin3H2(mnkB,fB,hB),nB,pB);
      mnkR=mnkR+mnkR2;
      mnkG=mnkG+mnkG2;
      mnkB=mnkB+mnkB2;
    #endif
    AH2 mxkR=AMax3H2(AMax3H2(gR,jR,kR),lR,oR);
    AH2 mxkG=AMax3H2(AMax3H2(gG,jG,kG),lG,oG);
    AH2 mxkB=AMax3H2(AMax3H2(gB,jB,kB),lB,oB);
    #ifdef CAS_BETTER_DIAGONALS
      AH2 mxkR2=AMax3H2(AMax3H2(mxkR,fR,hR),nR,pR);
      AH2 mxkG2=AMax3H2(AMax3H2(mxkG,fG,hG),nG,pG);
      AH2 mxkB2=AMax3H2(AMax3H2(mxkB,fB,hB),nB,pB);
      mxkR=mxkR+mxkR2;
      mxkG=mxkG+mxkG2;
      mxkB=mxkB+mxkB2;
    #endif
    // Smooth minimum distance to signal limit divided by smooth max.
    #ifdef CAS_GO_SLOWER
      AH2 rcpMfR=ARcpH2(mxfR);
      AH2 rcpMfG=ARcpH2(mxfG);
      AH2 rcpMfB=ARcpH2(mxfB);
      AH2 rcpMgR=ARcpH2(mxgR);
      AH2 rcpMgG=ARcpH2(mxgG);
      AH2 rcpMgB=ARcpH2(mxgB);
      AH2 rcpMjR=ARcpH2(mxjR);
      AH2 rcpMjG=ARcpH2(mxjG);
      AH2 rcpMjB=ARcpH2(mxjB);
      AH2 rcpMkR=ARcpH2(mxkR);
      AH2 rcpMkG=ARcpH2(mxkG);
      AH2 rcpMkB=ARcpH2(mxkB);
    #else
      AH2 rcpMfR=APrxLoRcpH2(mxfR);
      AH2 rcpMfG=APrxLoRcpH2(mxfG);
      AH2 rcpMfB=APrxLoRcpH2(mxfB);
      AH2 rcpMgR=APrxLoRcpH2(mxgR);
      AH2 rcpMgG=APrxLoRcpH2(mxgG);
      AH2 rcpMgB=APrxLoRcpH2(mxgB);
      AH2 rcpMjR=APrxLoRcpH2(mxjR);
      AH2 rcpMjG=APrxLoRcpH2(mxjG);
      AH2 rcpMjB=APrxLoRcpH2(mxjB);
      AH2 rcpMkR=APrxLoRcpH2(mxkR);
      AH2 rcpMkG=APrxLoRcpH2(mxkG);
      AH2 rcpMkB=APrxLoRcpH2(mxkB);
    #endif
    #ifdef CAS_BETTER_DIAGONALS
      AH2 ampfR=ASatH2(min(mnfR,AH2_(2.0)-mxfR)*rcpMfR);
      AH2 ampfG=ASatH2(min(mnfG,AH2_(2.0)-mxfG)*rcpMfG);
      AH2 ampfB=ASatH2(min(mnfB,AH2_(2.0)-mxfB)*rcpMfB);
      AH2 ampgR=ASatH2(min(mngR,AH2_(2.0)-mxgR)*rcpMgR);
      AH2 ampgG=ASatH2(min(mngG,AH2_(2.0)-mxgG)*rcpMgG);
      AH2 ampgB=ASatH2(min(mngB,AH2_(2.0)-mxgB)*rcpMgB);
      AH2 ampjR=ASatH2(min(mnjR,AH2_(2.0)-mxjR)*rcpMjR);
      AH2 ampjG=ASatH2(min(mnjG,AH2_(2.0)-mxjG)*rcpMjG);
      AH2 ampjB=ASatH2(min(mnjB,AH2_(2.0)-mxjB)*rcpMjB);
      AH2 ampkR=ASatH2(min(mnkR,AH2_(2.0)-mxkR)*rcpMkR);
      AH2 ampkG=ASatH2(min(mnkG,AH2_(2.0)-mxkG)*rcpMkG);
      AH2 ampkB=ASatH2(min(mnkB,AH2_(2.0)-mxkB)*rcpMkB);
    #else
      AH2 ampfR=ASatH2(min(mnfR,AH2_(1.0)-mxfR)*rcpMfR);
      AH2 ampfG=ASatH2(min(mnfG,AH2_(1.0)-mxfG)*rcpMfG);
      AH2 ampfB=ASatH2(min(mnfB,AH2_(1.0)-mxfB)*rcpMfB);
      AH2 ampgR=ASatH2(min(mngR,AH2_(1.0)-mxgR)*rcpMgR);
      AH2 ampgG=ASatH2(min(mngG,AH2_(1.0)-mxgG)*rcpMgG);
      AH2 ampgB=ASatH2(min(mngB,AH2_(1.0)-mxgB)*rcpMgB);
      AH2 ampjR=ASatH2(min(mnjR,AH2_(1.0)-mxjR)*rcpMjR);
      AH2 ampjG=ASatH2(min(mnjG,AH2_(1.0)-mxjG)*rcpMjG);
      AH2 ampjB=ASatH2(min(mnjB,AH2_(1.0)-mxjB)*rcpMjB);
      AH2 ampkR=ASatH2(min(mnkR,AH2_(1.0)-mxkR)*rcpMkR);
      AH2 ampkG=ASatH2(min(mnkG,AH2_(1.0)-mxkG)*rcpMkG);
      AH2 ampkB=ASatH2(min(mnkB,AH2_(1.0)-mxkB)*rcpMkB);
    #endif
    // Shaping amount of sharpening.
    #ifdef CAS_GO_SLOWER
      ampfR=sqrt(ampfR);
      ampfG=sqrt(ampfG);
      ampfB=sqrt(ampfB);
      ampgR=sqrt(ampgR);
      ampgG=sqrt(ampgG);
      ampgB=sqrt(ampgB);
      ampjR=sqrt(ampjR);
      ampjG=sqrt(ampjG);
      ampjB=sqrt(ampjB);
      ampkR=sqrt(ampkR);
      ampkG=sqrt(ampkG);
      ampkB=sqrt(ampkB);
    #else
      ampfR=APrxLoSqrtH2(ampfR);
      ampfG=APrxLoSqrtH2(ampfG);
      ampfB=APrxLoSqrtH2(ampfB);
      ampgR=APrxLoSqrtH2(ampgR);
      ampgG=APrxLoSqrtH2(ampgG);
      ampgB=APrxLoSqrtH2(ampgB);
      ampjR=APrxLoSqrtH2(ampjR);
      ampjG=APrxLoSqrtH2(ampjG);
      ampjB=APrxLoSqrtH2(ampjB);
      ampkR=APrxLoSqrtH2(ampkR);
      ampkG=APrxLoSqrtH2(ampkG);
      ampkB=APrxLoSqrtH2(ampkB);
    #endif
    // Filter shape.
    //  0 w 0
    //  w 1 w
    //  0 w 0
    AH1 peak=AH2_AU1(const1.y).x;
    AH2 wfR=ampfR*AH2_(peak);
    AH2 wfG=ampfG*AH2_(peak);
    AH2 wfB=ampfB*AH2_(peak);
    AH2 wgR=ampgR*AH2_(peak);
    AH2 wgG=ampgG*AH2_(peak);
    AH2 wgB=ampgB*AH2_(peak);
    AH2 wjR=ampjR*AH2_(peak);
    AH2 wjG=ampjG*AH2_(peak);
    AH2 wjB=ampjB*AH2_(peak);
    AH2 wkR=ampkR*AH2_(peak);
    AH2 wkG=ampkG*AH2_(peak);
    AH2 wkB=ampkB*AH2_(peak);
    // Blend between 4 results.
    //  s t
    //  u v
    // Bilinear weights from the fractional source position (ppX differs per tile, ppY is shared).
    AH2 s=(AH2_(1.0)-ppX)*(AH2_(1.0)-AH2_(ppY));
    AH2 t=           ppX *(AH2_(1.0)-AH2_(ppY));
    AH2 u=(AH2_(1.0)-ppX)*           AH2_(ppY) ;
    AH2 v=           ppX *           AH2_(ppY) ;
    // Thin edges to hide bilinear interpolation (helps diagonals).
    // Each blend weight is divided by (1/32 + green max-min range) of its own neighborhood.
    AH2 thinB=AH2_(1.0/32.0);
    #ifdef CAS_GO_SLOWER
      s*=ARcpH2(thinB+(mxfG-mnfG));
      t*=ARcpH2(thinB+(mxgG-mngG));
      u*=ARcpH2(thinB+(mxjG-mnjG));
      v*=ARcpH2(thinB+(mxkG-mnkG));
    #else
      s*=APrxLoRcpH2(thinB+(mxfG-mnfG));
      t*=APrxLoRcpH2(thinB+(mxgG-mngG));
      u*=APrxLoRcpH2(thinB+(mxjG-mnjG));
      v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
    #endif
    // Final weighting.
    // q<taps><channel> is the combined weight applied to the named source taps:
    //  qbe -> b and e, qch -> c and h, qin -> i and n, qlo -> l and o, qf/qg/qj/qk -> the 4 center taps.
    A_MAYBE_UNUSED AH2 qbeR=wfR*s;
    A_MAYBE_UNUSED AH2 qbeG=wfG*s;
    A_MAYBE_UNUSED AH2 qbeB=wfB*s;
    A_MAYBE_UNUSED AH2 qchR=wgR*t;
    A_MAYBE_UNUSED AH2 qchG=wgG*t;
    A_MAYBE_UNUSED AH2 qchB=wgB*t;
    A_MAYBE_UNUSED AH2 qfR=wgR*t+wjR*u+s;
    A_MAYBE_UNUSED AH2 qfG=wgG*t+wjG*u+s;
    A_MAYBE_UNUSED AH2 qfB=wgB*t+wjB*u+s;
    A_MAYBE_UNUSED AH2 qgR=wfR*s+wkR*v+t;
    A_MAYBE_UNUSED AH2 qgG=wfG*s+wkG*v+t;
    A_MAYBE_UNUSED AH2 qgB=wfB*s+wkB*v+t;
    A_MAYBE_UNUSED AH2 qjR=wfR*s+wkR*v+u;
    A_MAYBE_UNUSED AH2 qjG=wfG*s+wkG*v+u;
    A_MAYBE_UNUSED AH2 qjB=wfB*s+wkB*v+u;
    A_MAYBE_UNUSED AH2 qkR=wgR*t+wjR*u+v;
    A_MAYBE_UNUSED AH2 qkG=wgG*t+wjG*u+v;
    A_MAYBE_UNUSED AH2 qkB=wgB*t+wjB*u+v;
    A_MAYBE_UNUSED AH2 qinR=wjR*u;
    A_MAYBE_UNUSED AH2 qinG=wjG*u;
    A_MAYBE_UNUSED AH2 qinB=wjB*u;
    A_MAYBE_UNUSED AH2 qloR=wkR*v;
    A_MAYBE_UNUSED AH2 qloG=wkG*v;
    A_MAYBE_UNUSED AH2 qloB=wkB*v;
    // Filter.
    #ifndef CAS_SLOW
      // Using green coef only for all three channels.
      #ifdef CAS_GO_SLOWER
        AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
      #else
        AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
      #endif
      pixR=ASatH2((bR*qbeG+eR*qbeG+cR*qchG+hR*qchG+iR*qinG+nR*qinG+lR*qloG+oR*qloG+fR*qfG+gR*qgG+jR*qjG+kR*qkG)*rcpWG);
      pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
      pixB=ASatH2((bB*qbeG+eB*qbeG+cB*qchG+hB*qchG+iB*qinG+nB*qinG+lB*qloG+oB*qloG+fB*qfG+gB*qgG+jB*qjG+kB*qkG)*rcpWG);
    #else
      // CAS_SLOW path: each channel uses its own coefficients.
      #ifdef CAS_GO_SLOWER
        AH2 rcpWR=ARcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
        AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
        AH2 rcpWB=ARcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
      #else
        AH2 rcpWR=APrxMedRcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
        AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
        AH2 rcpWB=APrxMedRcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
      #endif
      pixR=ASatH2((bR*qbeR+eR*qbeR+cR*qchR+hR*qchR+iR*qinR+nR*qinR+lR*qloR+oR*qloR+fR*qfR+gR*qgR+jR*qjR+kR*qkR)*rcpWR);
      pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
      pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
    #endif
  }
  #undef TEXINPUT
  #undef TEXCALL
#endif