Resources: ffx_cas.h Metal support

This commit is contained in:
TellowKrinkle 2022-11-20 17:04:05 -06:00 committed by refractionpcsx2
parent cef8d03d49
commit 2dfb819d35
1 changed file with 172 additions and 141 deletions

View File

@ -96,6 +96,8 @@
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead. // // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead. // // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);} // AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
// // If you define CAS_TEXTURE and/or CAS_TEXTUREH to a type, a value of that type will be added as the first input to CasFilter() (or CasFilterH() for CAS_TEXTUREH) and forwarded to every CasLoad() (or CasLoadH()) call.
// // This is useful for forwarding extra data to the load functions, and is required by MSL, which doesn't use global textures.
// ... // ...
// // Define the input modifiers as nop's initially. // // Define the input modifiers as nop's initially.
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions. // // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
@ -399,16 +401,28 @@ A_STATIC void CasSetup(
// NON-PACKED VERSION // NON-PACKED VERSION
//============================================================================================================================== //==============================================================================================================================
#ifdef A_GPU #ifdef A_GPU
#if defined(A_MSL) && !defined(CAS_TEXTURE)
#define CAS_TEXTURE texture2d<float>
#endif
#ifdef CAS_TEXTURE
#define TEXCALL tex,
#define TEXINPUT CAS_TEXTURE tex,
#else
#define TEXCALL
#define TEXINPUT
#endif
#ifdef CAS_PACKED_ONLY #ifdef CAS_PACKED_ONLY
// Avoid compiler error. // Avoid compiler error.
AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);} A_STATIC AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){} // MSL Doesn't let you inout vector elements, so use a macro
#define CasInput(r,g,b)
#endif #endif
//------------------------------------------------------------------------------------------------------------------------------ //------------------------------------------------------------------------------------------------------------------------------
void CasFilter( A_STATIC void CasFilter(
out AF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy. TEXINPUT
out AF1 pixG, outAF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
out AF1 pixB, outAF1 pixG,
outAF1 pixB,
AU2 ip, // Integer pixel position in output. AU2 ip, // Integer pixel position in output.
AU4 const0, // Constants generated by CasSetup(). AU4 const0, // Constants generated by CasSetup().
AU4 const1, AU4 const1,
@ -426,15 +440,15 @@ A_STATIC void CasSetup(
// d e f // d e f
// g h i // g h i
ASU2 sp=ASU2(ip); ASU2 sp=ASU2(ip);
AF3 a=CasLoad(sp+ASU2(-1,-1)); A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1)); A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1));
AF3 c=CasLoad(sp+ASU2( 1,-1)); A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2(-1, 0)); A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2(-1, 0));
AF3 e=CasLoad(sp); A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp);
AF3 f=CasLoad(sp+ASU2( 1, 0)); A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp+ASU2( 1, 0));
AF3 g=CasLoad(sp+ASU2(-1, 1)); A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2(-1, 1));
AF3 h=CasLoad(sp+ASU2( 0, 1)); A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 0, 1));
AF3 i=CasLoad(sp+ASU2( 1, 1)); A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2( 1, 1));
// Run optional input transform. // Run optional input transform.
CasInput(a.r,a.g,a.b); CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b); CasInput(b.r,b.g,b.b);
@ -505,10 +519,10 @@ A_STATIC void CasSetup(
// 0 w 0 // 0 w 0
// w 1 w // w 1 w
// 0 w 0 // 0 w 0
AF1 peak=AF1_AU1(const1.x); A_MAYBE_UNUSED AF1 peak=AF1_AU1(const1.x);
AF1 wR=ampR*peak; A_MAYBE_UNUSED AF1 wR=ampR*peak;
AF1 wG=ampG*peak; A_MAYBE_UNUSED AF1 wG=ampG*peak;
AF1 wB=ampB*peak; A_MAYBE_UNUSED AF1 wB=ampB*peak;
// Filter. // Filter.
#ifndef CAS_SLOW #ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead. // Using green coef only, depending on dead code removal to strip out the extra overhead.
@ -555,22 +569,22 @@ A_STATIC void CasSetup(
AF2 fp=floor(pp); AF2 fp=floor(pp);
pp-=fp; pp-=fp;
ASU2 sp=ASU2(fp); ASU2 sp=ASU2(fp);
AF3 a=CasLoad(sp+ASU2(-1,-1)); A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1)); A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1));
AF3 e=CasLoad(sp+ASU2(-1, 0)); A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp+ASU2(-1, 0));
AF3 f=CasLoad(sp); A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp);
AF3 c=CasLoad(sp+ASU2( 1,-1)); A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2( 2,-1)); A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2( 2,-1));
AF3 g=CasLoad(sp+ASU2( 1, 0)); A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2( 1, 0));
AF3 h=CasLoad(sp+ASU2( 2, 0)); A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 2, 0));
AF3 i=CasLoad(sp+ASU2(-1, 1)); A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2(-1, 1));
AF3 j=CasLoad(sp+ASU2( 0, 1)); A_MAYBE_UNUSED AF3 j=CasLoad(TEXCALL sp+ASU2( 0, 1));
AF3 m=CasLoad(sp+ASU2(-1, 2)); A_MAYBE_UNUSED AF3 m=CasLoad(TEXCALL sp+ASU2(-1, 2));
AF3 n=CasLoad(sp+ASU2( 0, 2)); A_MAYBE_UNUSED AF3 n=CasLoad(TEXCALL sp+ASU2( 0, 2));
AF3 k=CasLoad(sp+ASU2( 1, 1)); A_MAYBE_UNUSED AF3 k=CasLoad(TEXCALL sp+ASU2( 1, 1));
AF3 l=CasLoad(sp+ASU2( 2, 1)); A_MAYBE_UNUSED AF3 l=CasLoad(TEXCALL sp+ASU2( 2, 1));
AF3 o=CasLoad(sp+ASU2( 1, 2)); A_MAYBE_UNUSED AF3 o=CasLoad(TEXCALL sp+ASU2( 1, 2));
AF3 p=CasLoad(sp+ASU2( 2, 2)); A_MAYBE_UNUSED AF3 p=CasLoad(TEXCALL sp+ASU2( 2, 2));
// Run optional input transform. // Run optional input transform.
CasInput(a.r,a.g,a.b); CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b); CasInput(b.r,b.g,b.b);
@ -827,30 +841,30 @@ A_STATIC void CasSetup(
// _____ _____ _____ _____ // _____ _____ _____ _____
// //
// ju kv // ju kv
AF1 qbeR=wfR*s; A_MAYBE_UNUSED AF1 qbeR=wfR*s;
AF1 qbeG=wfG*s; A_MAYBE_UNUSED AF1 qbeG=wfG*s;
AF1 qbeB=wfB*s; A_MAYBE_UNUSED AF1 qbeB=wfB*s;
AF1 qchR=wgR*t; A_MAYBE_UNUSED AF1 qchR=wgR*t;
AF1 qchG=wgG*t; A_MAYBE_UNUSED AF1 qchG=wgG*t;
AF1 qchB=wgB*t; A_MAYBE_UNUSED AF1 qchB=wgB*t;
AF1 qfR=wgR*t+wjR*u+s; A_MAYBE_UNUSED AF1 qfR=wgR*t+wjR*u+s;
AF1 qfG=wgG*t+wjG*u+s; A_MAYBE_UNUSED AF1 qfG=wgG*t+wjG*u+s;
AF1 qfB=wgB*t+wjB*u+s; A_MAYBE_UNUSED AF1 qfB=wgB*t+wjB*u+s;
AF1 qgR=wfR*s+wkR*v+t; A_MAYBE_UNUSED AF1 qgR=wfR*s+wkR*v+t;
AF1 qgG=wfG*s+wkG*v+t; A_MAYBE_UNUSED AF1 qgG=wfG*s+wkG*v+t;
AF1 qgB=wfB*s+wkB*v+t; A_MAYBE_UNUSED AF1 qgB=wfB*s+wkB*v+t;
AF1 qjR=wfR*s+wkR*v+u; A_MAYBE_UNUSED AF1 qjR=wfR*s+wkR*v+u;
AF1 qjG=wfG*s+wkG*v+u; A_MAYBE_UNUSED AF1 qjG=wfG*s+wkG*v+u;
AF1 qjB=wfB*s+wkB*v+u; A_MAYBE_UNUSED AF1 qjB=wfB*s+wkB*v+u;
AF1 qkR=wgR*t+wjR*u+v; A_MAYBE_UNUSED AF1 qkR=wgR*t+wjR*u+v;
AF1 qkG=wgG*t+wjG*u+v; A_MAYBE_UNUSED AF1 qkG=wgG*t+wjG*u+v;
AF1 qkB=wgB*t+wjB*u+v; A_MAYBE_UNUSED AF1 qkB=wgB*t+wjB*u+v;
AF1 qinR=wjR*u; A_MAYBE_UNUSED AF1 qinR=wjR*u;
AF1 qinG=wjG*u; A_MAYBE_UNUSED AF1 qinG=wjG*u;
AF1 qinB=wjB*u; A_MAYBE_UNUSED AF1 qinB=wjB*u;
AF1 qloR=wkR*v; A_MAYBE_UNUSED AF1 qloR=wkR*v;
AF1 qloG=wkG*v; A_MAYBE_UNUSED AF1 qloG=wkG*v;
AF1 qloB=wkB*v; A_MAYBE_UNUSED AF1 qloB=wkB*v;
// Filter. // Filter.
#ifndef CAS_SLOW #ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead. // Using green coef only, depending on dead code removal to strip out the extra overhead.
@ -877,6 +891,9 @@ A_STATIC void CasSetup(
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB); pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
#endif #endif
} }
#undef TEXINPUT
#undef TEXCALL
#endif #endif
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -885,6 +902,16 @@ A_STATIC void CasSetup(
// PACKED VERSION // PACKED VERSION
//============================================================================================================================== //==============================================================================================================================
#if defined(A_GPU) && defined(A_HALF) #if defined(A_GPU) && defined(A_HALF)
#if defined(A_MSL) && !defined(CAS_TEXTUREH)
#define CAS_TEXTUREH texture2d<half>
#endif
#ifdef CAS_TEXTUREH
#define TEXCALL tex,
#define TEXINPUT CAS_TEXTUREH tex,
#else
#define TEXCALL
#define TEXINPUT
#endif
// Missing a way to do packed re-interpretation, so must disable approximation optimizations. // Missing a way to do packed re-interpretation, so must disable approximation optimizations.
#ifdef A_HLSL #ifdef A_HLSL
#ifndef CAS_GO_SLOWER #ifndef CAS_GO_SLOWER
@ -893,7 +920,7 @@ A_STATIC void CasSetup(
#endif #endif
//============================================================================================================================== //==============================================================================================================================
// Can be used to convert from packed SOA to AOS for store. // Can be used to convert from packed SOA to AOS for store.
void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ void CasDepack(outAH4 pix0,outAH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
#ifdef A_HLSL #ifdef A_HLSL
// Invoke a slower path for DX only, since it won't allow uninitialized values. // Invoke a slower path for DX only, since it won't allow uninitialized values.
pix0.a=pix1.a=0.0; pix0.a=pix1.a=0.0;
@ -902,13 +929,14 @@ A_STATIC void CasSetup(
pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
//============================================================================================================================== //==============================================================================================================================
void CasFilterH( void CasFilterH(
TEXINPUT
// Output values are for 2 8x8 tiles in a 16x8 region. // Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = right 8x8 tile // pix<R,G,B>.x = right 8x8 tile
// pix<R,G,B>.y = left 8x8 tile // pix<R,G,B>.y = left 8x8 tile
// This enables later processing to easily be packed as well. // This enables later processing to easily be packed as well.
out AH2 pixR, outAH2 pixR,
out AH2 pixG, outAH2 pixG,
out AH2 pixB, outAH2 pixB,
AU2 ip, // Integer pixel position in output. AU2 ip, // Integer pixel position in output.
AU4 const0, // Constants generated by CasSetup(). AU4 const0, // Constants generated by CasSetup().
AU4 const1, AU4 const1,
@ -923,25 +951,25 @@ A_STATIC void CasSetup(
// No scaling algorithm uses minimal 3x3 pixel neighborhood. // No scaling algorithm uses minimal 3x3 pixel neighborhood.
if(noScaling){ if(noScaling){
ASW2 sp0=ASW2(ip); ASW2 sp0=ASW2(ip);
AH3 a0=CasLoadH(sp0+ASW2(-1,-1)); AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
AH3 b0=CasLoadH(sp0+ASW2( 0,-1)); AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
AH3 c0=CasLoadH(sp0+ASW2( 1,-1)); AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
AH3 d0=CasLoadH(sp0+ASW2(-1, 0)); AH3 d0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
AH3 e0=CasLoadH(sp0); AH3 e0=CasLoadH(TEXCALL sp0);
AH3 f0=CasLoadH(sp0+ASW2( 1, 0)); AH3 f0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
AH3 g0=CasLoadH(sp0+ASW2(-1, 1)); AH3 g0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
AH3 h0=CasLoadH(sp0+ASW2( 0, 1)); AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
AH3 i0=CasLoadH(sp0+ASW2( 1, 1)); AH3 i0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
ASW2 sp1=sp0+ASW2(8,0); ASW2 sp1=sp0+ASW2(8,0);
AH3 a1=CasLoadH(sp1+ASW2(-1,-1)); AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
AH3 b1=CasLoadH(sp1+ASW2( 0,-1)); AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
AH3 c1=CasLoadH(sp1+ASW2( 1,-1)); AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
AH3 d1=CasLoadH(sp1+ASW2(-1, 0)); AH3 d1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
AH3 e1=CasLoadH(sp1); AH3 e1=CasLoadH(TEXCALL sp1);
AH3 f1=CasLoadH(sp1+ASW2( 1, 0)); AH3 f1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
AH3 g1=CasLoadH(sp1+ASW2(-1, 1)); AH3 g1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
AH3 h1=CasLoadH(sp1+ASW2( 0, 1)); AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
AH3 i1=CasLoadH(sp1+ASW2( 1, 1)); AH3 i1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
// AOS to SOA conversion. // AOS to SOA conversion.
AH2 aR=AH2(a0.r,a1.r); AH2 aR=AH2(a0.r,a1.r);
AH2 aG=AH2(a0.g,a1.g); AH2 aG=AH2(a0.g,a1.g);
@ -1033,10 +1061,10 @@ A_STATIC void CasSetup(
ampB=APrxLoSqrtH2(ampB); ampB=APrxLoSqrtH2(ampB);
#endif #endif
// Filter shape. // Filter shape.
AH1 peak=AH2_AU1(const1.y).x; A_MAYBE_UNUSED AH1 peak=AH2_AU1(const1.y).x;
AH2 wR=ampR*AH2_(peak); A_MAYBE_UNUSED AH2 wR=ampR*AH2_(peak);
AH2 wG=ampG*AH2_(peak); A_MAYBE_UNUSED AH2 wG=ampG*AH2_(peak);
AH2 wB=ampB*AH2_(peak); A_MAYBE_UNUSED AH2 wB=ampB*AH2_(peak);
// Filter. // Filter.
#ifndef CAS_SLOW #ifndef CAS_SLOW
#ifdef CAS_GO_SLOWER #ifdef CAS_GO_SLOWER
@ -1072,43 +1100,43 @@ A_STATIC void CasSetup(
ppX.x=AH1(pp.x-fp0.x); ppX.x=AH1(pp.x-fp0.x);
AH1 ppY=AH1(pp.y-fp0.y); AH1 ppY=AH1(pp.y-fp0.y);
ASW2 sp0=ASW2(fp0); ASW2 sp0=ASW2(fp0);
AH3 a0=CasLoadH(sp0+ASW2(-1,-1)); AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
AH3 b0=CasLoadH(sp0+ASW2( 0,-1)); AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
AH3 e0=CasLoadH(sp0+ASW2(-1, 0)); AH3 e0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
AH3 f0=CasLoadH(sp0); AH3 f0=CasLoadH(TEXCALL sp0);
AH3 c0=CasLoadH(sp0+ASW2( 1,-1)); AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
AH3 d0=CasLoadH(sp0+ASW2( 2,-1)); AH3 d0=CasLoadH(TEXCALL sp0+ASW2( 2,-1));
AH3 g0=CasLoadH(sp0+ASW2( 1, 0)); AH3 g0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
AH3 h0=CasLoadH(sp0+ASW2( 2, 0)); AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 2, 0));
AH3 i0=CasLoadH(sp0+ASW2(-1, 1)); AH3 i0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
AH3 j0=CasLoadH(sp0+ASW2( 0, 1)); AH3 j0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
AH3 m0=CasLoadH(sp0+ASW2(-1, 2)); AH3 m0=CasLoadH(TEXCALL sp0+ASW2(-1, 2));
AH3 n0=CasLoadH(sp0+ASW2( 0, 2)); AH3 n0=CasLoadH(TEXCALL sp0+ASW2( 0, 2));
AH3 k0=CasLoadH(sp0+ASW2( 1, 1)); AH3 k0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
AH3 l0=CasLoadH(sp0+ASW2( 2, 1)); AH3 l0=CasLoadH(TEXCALL sp0+ASW2( 2, 1));
AH3 o0=CasLoadH(sp0+ASW2( 1, 2)); AH3 o0=CasLoadH(TEXCALL sp0+ASW2( 1, 2));
AH3 p0=CasLoadH(sp0+ASW2( 2, 2)); AH3 p0=CasLoadH(TEXCALL sp0+ASW2( 2, 2));
// Tile 1 (offset only in x). // Tile 1 (offset only in x).
AF1 pp1=pp.x+AF1_AU1(const1.z); AF1 pp1=pp.x+AF1_AU1(const1.z);
AF1 fp1=floor(pp1); AF1 fp1=floor(pp1);
ppX.y=AH1(pp1-fp1); ppX.y=AH1(pp1-fp1);
ASW2 sp1=ASW2(fp1,sp0.y); ASW2 sp1=ASW2(fp1,sp0.y);
AH3 a1=CasLoadH(sp1+ASW2(-1,-1)); AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
AH3 b1=CasLoadH(sp1+ASW2( 0,-1)); AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
AH3 e1=CasLoadH(sp1+ASW2(-1, 0)); AH3 e1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
AH3 f1=CasLoadH(sp1); AH3 f1=CasLoadH(TEXCALL sp1);
AH3 c1=CasLoadH(sp1+ASW2( 1,-1)); AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
AH3 d1=CasLoadH(sp1+ASW2( 2,-1)); AH3 d1=CasLoadH(TEXCALL sp1+ASW2( 2,-1));
AH3 g1=CasLoadH(sp1+ASW2( 1, 0)); AH3 g1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
AH3 h1=CasLoadH(sp1+ASW2( 2, 0)); AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 2, 0));
AH3 i1=CasLoadH(sp1+ASW2(-1, 1)); AH3 i1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
AH3 j1=CasLoadH(sp1+ASW2( 0, 1)); AH3 j1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
AH3 m1=CasLoadH(sp1+ASW2(-1, 2)); AH3 m1=CasLoadH(TEXCALL sp1+ASW2(-1, 2));
AH3 n1=CasLoadH(sp1+ASW2( 0, 2)); AH3 n1=CasLoadH(TEXCALL sp1+ASW2( 0, 2));
AH3 k1=CasLoadH(sp1+ASW2( 1, 1)); AH3 k1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
AH3 l1=CasLoadH(sp1+ASW2( 2, 1)); AH3 l1=CasLoadH(TEXCALL sp1+ASW2( 2, 1));
AH3 o1=CasLoadH(sp1+ASW2( 1, 2)); AH3 o1=CasLoadH(TEXCALL sp1+ASW2( 1, 2));
AH3 p1=CasLoadH(sp1+ASW2( 2, 2)); AH3 p1=CasLoadH(TEXCALL sp1+ASW2( 2, 2));
// AOS to SOA conversion. // AOS to SOA conversion.
AH2 aR=AH2(a0.r,a1.r); AH2 aR=AH2(a0.r,a1.r);
AH2 aG=AH2(a0.g,a1.g); AH2 aG=AH2(a0.g,a1.g);
@ -1393,30 +1421,30 @@ A_STATIC void CasSetup(
v*=APrxLoRcpH2(thinB+(mxkG-mnkG)); v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
#endif #endif
// Final weighting. // Final weighting.
AH2 qbeR=wfR*s; A_MAYBE_UNUSED AH2 qbeR=wfR*s;
AH2 qbeG=wfG*s; A_MAYBE_UNUSED AH2 qbeG=wfG*s;
AH2 qbeB=wfB*s; A_MAYBE_UNUSED AH2 qbeB=wfB*s;
AH2 qchR=wgR*t; A_MAYBE_UNUSED AH2 qchR=wgR*t;
AH2 qchG=wgG*t; A_MAYBE_UNUSED AH2 qchG=wgG*t;
AH2 qchB=wgB*t; A_MAYBE_UNUSED AH2 qchB=wgB*t;
AH2 qfR=wgR*t+wjR*u+s; A_MAYBE_UNUSED AH2 qfR=wgR*t+wjR*u+s;
AH2 qfG=wgG*t+wjG*u+s; A_MAYBE_UNUSED AH2 qfG=wgG*t+wjG*u+s;
AH2 qfB=wgB*t+wjB*u+s; A_MAYBE_UNUSED AH2 qfB=wgB*t+wjB*u+s;
AH2 qgR=wfR*s+wkR*v+t; A_MAYBE_UNUSED AH2 qgR=wfR*s+wkR*v+t;
AH2 qgG=wfG*s+wkG*v+t; A_MAYBE_UNUSED AH2 qgG=wfG*s+wkG*v+t;
AH2 qgB=wfB*s+wkB*v+t; A_MAYBE_UNUSED AH2 qgB=wfB*s+wkB*v+t;
AH2 qjR=wfR*s+wkR*v+u; A_MAYBE_UNUSED AH2 qjR=wfR*s+wkR*v+u;
AH2 qjG=wfG*s+wkG*v+u; A_MAYBE_UNUSED AH2 qjG=wfG*s+wkG*v+u;
AH2 qjB=wfB*s+wkB*v+u; A_MAYBE_UNUSED AH2 qjB=wfB*s+wkB*v+u;
AH2 qkR=wgR*t+wjR*u+v; A_MAYBE_UNUSED AH2 qkR=wgR*t+wjR*u+v;
AH2 qkG=wgG*t+wjG*u+v; A_MAYBE_UNUSED AH2 qkG=wgG*t+wjG*u+v;
AH2 qkB=wgB*t+wjB*u+v; A_MAYBE_UNUSED AH2 qkB=wgB*t+wjB*u+v;
AH2 qinR=wjR*u; A_MAYBE_UNUSED AH2 qinR=wjR*u;
AH2 qinG=wjG*u; A_MAYBE_UNUSED AH2 qinG=wjG*u;
AH2 qinB=wjB*u; A_MAYBE_UNUSED AH2 qinB=wjB*u;
AH2 qloR=wkR*v; A_MAYBE_UNUSED AH2 qloR=wkR*v;
AH2 qloG=wkG*v; A_MAYBE_UNUSED AH2 qloG=wkG*v;
AH2 qloB=wkB*v; A_MAYBE_UNUSED AH2 qloB=wkB*v;
// Filter. // Filter.
#ifndef CAS_SLOW #ifndef CAS_SLOW
#ifdef CAS_GO_SLOWER #ifdef CAS_GO_SLOWER
@ -1442,4 +1470,7 @@ A_STATIC void CasSetup(
pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB); pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
#endif #endif
} }
#undef TEXINPUT
#undef TEXCALL
#endif #endif