mirror of https://github.com/PCSX2/pcsx2.git
Resources: ffx_cas.h Metal support
This commit is contained in:
parent
cef8d03d49
commit
2dfb819d35
|
@ -96,6 +96,8 @@
|
||||||
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
|
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
|
||||||
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
|
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
|
||||||
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
|
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
|
||||||
|
// // If you define CAS_TEXTURE and/or CAS_TEXTUREH to a type, a value of that type will be added as the first input to CasFilter and forwarded to CasLoad
|
||||||
|
// // This is useful for forwarding extra data to the load functions, and is required by MSL, which doesn't use global textures
|
||||||
// ...
|
// ...
|
||||||
// // Define the input modifiers as nop's initially.
|
// // Define the input modifiers as nop's initially.
|
||||||
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
|
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
|
||||||
|
@ -399,16 +401,28 @@ A_STATIC void CasSetup(
|
||||||
// NON-PACKED VERSION
|
// NON-PACKED VERSION
|
||||||
//==============================================================================================================================
|
//==============================================================================================================================
|
||||||
#ifdef A_GPU
|
#ifdef A_GPU
|
||||||
|
#if defined(A_MSL) && !defined(CAS_TEXTURE)
|
||||||
|
#define CAS_TEXTURE texture2d<float>
|
||||||
|
#endif
|
||||||
|
#ifdef CAS_TEXTURE
|
||||||
|
#define TEXCALL tex,
|
||||||
|
#define TEXINPUT CAS_TEXTURE tex,
|
||||||
|
#else
|
||||||
|
#define TEXCALL
|
||||||
|
#define TEXINPUT
|
||||||
|
#endif
|
||||||
#ifdef CAS_PACKED_ONLY
|
#ifdef CAS_PACKED_ONLY
|
||||||
// Avoid compiler error.
|
// Avoid compiler error.
|
||||||
AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
|
A_STATIC AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
|
||||||
void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
|
// MSL doesn't allow vector elements to be passed as inout arguments, so use a macro
|
||||||
|
#define CasInput(r,g,b)
|
||||||
#endif
|
#endif
|
||||||
//------------------------------------------------------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------------------------------------------------------
|
||||||
void CasFilter(
|
A_STATIC void CasFilter(
|
||||||
out AF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
|
TEXINPUT
|
||||||
out AF1 pixG,
|
outAF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
|
||||||
out AF1 pixB,
|
outAF1 pixG,
|
||||||
|
outAF1 pixB,
|
||||||
AU2 ip, // Integer pixel position in output.
|
AU2 ip, // Integer pixel position in output.
|
||||||
AU4 const0, // Constants generated by CasSetup().
|
AU4 const0, // Constants generated by CasSetup().
|
||||||
AU4 const1,
|
AU4 const1,
|
||||||
|
@ -426,15 +440,15 @@ A_STATIC void CasSetup(
|
||||||
// d e f
|
// d e f
|
||||||
// g h i
|
// g h i
|
||||||
ASU2 sp=ASU2(ip);
|
ASU2 sp=ASU2(ip);
|
||||||
AF3 a=CasLoad(sp+ASU2(-1,-1));
|
A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1));
|
||||||
AF3 b=CasLoad(sp+ASU2( 0,-1));
|
A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1));
|
||||||
AF3 c=CasLoad(sp+ASU2( 1,-1));
|
A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1));
|
||||||
AF3 d=CasLoad(sp+ASU2(-1, 0));
|
A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2(-1, 0));
|
||||||
AF3 e=CasLoad(sp);
|
A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp);
|
||||||
AF3 f=CasLoad(sp+ASU2( 1, 0));
|
A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp+ASU2( 1, 0));
|
||||||
AF3 g=CasLoad(sp+ASU2(-1, 1));
|
A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2(-1, 1));
|
||||||
AF3 h=CasLoad(sp+ASU2( 0, 1));
|
A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 0, 1));
|
||||||
AF3 i=CasLoad(sp+ASU2( 1, 1));
|
A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2( 1, 1));
|
||||||
// Run optional input transform.
|
// Run optional input transform.
|
||||||
CasInput(a.r,a.g,a.b);
|
CasInput(a.r,a.g,a.b);
|
||||||
CasInput(b.r,b.g,b.b);
|
CasInput(b.r,b.g,b.b);
|
||||||
|
@ -505,10 +519,10 @@ A_STATIC void CasSetup(
|
||||||
// 0 w 0
|
// 0 w 0
|
||||||
// w 1 w
|
// w 1 w
|
||||||
// 0 w 0
|
// 0 w 0
|
||||||
AF1 peak=AF1_AU1(const1.x);
|
A_MAYBE_UNUSED AF1 peak=AF1_AU1(const1.x);
|
||||||
AF1 wR=ampR*peak;
|
A_MAYBE_UNUSED AF1 wR=ampR*peak;
|
||||||
AF1 wG=ampG*peak;
|
A_MAYBE_UNUSED AF1 wG=ampG*peak;
|
||||||
AF1 wB=ampB*peak;
|
A_MAYBE_UNUSED AF1 wB=ampB*peak;
|
||||||
// Filter.
|
// Filter.
|
||||||
#ifndef CAS_SLOW
|
#ifndef CAS_SLOW
|
||||||
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||||||
|
@ -555,22 +569,22 @@ A_STATIC void CasSetup(
|
||||||
AF2 fp=floor(pp);
|
AF2 fp=floor(pp);
|
||||||
pp-=fp;
|
pp-=fp;
|
||||||
ASU2 sp=ASU2(fp);
|
ASU2 sp=ASU2(fp);
|
||||||
AF3 a=CasLoad(sp+ASU2(-1,-1));
|
A_MAYBE_UNUSED AF3 a=CasLoad(TEXCALL sp+ASU2(-1,-1));
|
||||||
AF3 b=CasLoad(sp+ASU2( 0,-1));
|
A_MAYBE_UNUSED AF3 b=CasLoad(TEXCALL sp+ASU2( 0,-1));
|
||||||
AF3 e=CasLoad(sp+ASU2(-1, 0));
|
A_MAYBE_UNUSED AF3 e=CasLoad(TEXCALL sp+ASU2(-1, 0));
|
||||||
AF3 f=CasLoad(sp);
|
A_MAYBE_UNUSED AF3 f=CasLoad(TEXCALL sp);
|
||||||
AF3 c=CasLoad(sp+ASU2( 1,-1));
|
A_MAYBE_UNUSED AF3 c=CasLoad(TEXCALL sp+ASU2( 1,-1));
|
||||||
AF3 d=CasLoad(sp+ASU2( 2,-1));
|
A_MAYBE_UNUSED AF3 d=CasLoad(TEXCALL sp+ASU2( 2,-1));
|
||||||
AF3 g=CasLoad(sp+ASU2( 1, 0));
|
A_MAYBE_UNUSED AF3 g=CasLoad(TEXCALL sp+ASU2( 1, 0));
|
||||||
AF3 h=CasLoad(sp+ASU2( 2, 0));
|
A_MAYBE_UNUSED AF3 h=CasLoad(TEXCALL sp+ASU2( 2, 0));
|
||||||
AF3 i=CasLoad(sp+ASU2(-1, 1));
|
A_MAYBE_UNUSED AF3 i=CasLoad(TEXCALL sp+ASU2(-1, 1));
|
||||||
AF3 j=CasLoad(sp+ASU2( 0, 1));
|
A_MAYBE_UNUSED AF3 j=CasLoad(TEXCALL sp+ASU2( 0, 1));
|
||||||
AF3 m=CasLoad(sp+ASU2(-1, 2));
|
A_MAYBE_UNUSED AF3 m=CasLoad(TEXCALL sp+ASU2(-1, 2));
|
||||||
AF3 n=CasLoad(sp+ASU2( 0, 2));
|
A_MAYBE_UNUSED AF3 n=CasLoad(TEXCALL sp+ASU2( 0, 2));
|
||||||
AF3 k=CasLoad(sp+ASU2( 1, 1));
|
A_MAYBE_UNUSED AF3 k=CasLoad(TEXCALL sp+ASU2( 1, 1));
|
||||||
AF3 l=CasLoad(sp+ASU2( 2, 1));
|
A_MAYBE_UNUSED AF3 l=CasLoad(TEXCALL sp+ASU2( 2, 1));
|
||||||
AF3 o=CasLoad(sp+ASU2( 1, 2));
|
A_MAYBE_UNUSED AF3 o=CasLoad(TEXCALL sp+ASU2( 1, 2));
|
||||||
AF3 p=CasLoad(sp+ASU2( 2, 2));
|
A_MAYBE_UNUSED AF3 p=CasLoad(TEXCALL sp+ASU2( 2, 2));
|
||||||
// Run optional input transform.
|
// Run optional input transform.
|
||||||
CasInput(a.r,a.g,a.b);
|
CasInput(a.r,a.g,a.b);
|
||||||
CasInput(b.r,b.g,b.b);
|
CasInput(b.r,b.g,b.b);
|
||||||
|
@ -827,30 +841,30 @@ A_STATIC void CasSetup(
|
||||||
// _____ _____ _____ _____
|
// _____ _____ _____ _____
|
||||||
//
|
//
|
||||||
// ju kv
|
// ju kv
|
||||||
AF1 qbeR=wfR*s;
|
A_MAYBE_UNUSED AF1 qbeR=wfR*s;
|
||||||
AF1 qbeG=wfG*s;
|
A_MAYBE_UNUSED AF1 qbeG=wfG*s;
|
||||||
AF1 qbeB=wfB*s;
|
A_MAYBE_UNUSED AF1 qbeB=wfB*s;
|
||||||
AF1 qchR=wgR*t;
|
A_MAYBE_UNUSED AF1 qchR=wgR*t;
|
||||||
AF1 qchG=wgG*t;
|
A_MAYBE_UNUSED AF1 qchG=wgG*t;
|
||||||
AF1 qchB=wgB*t;
|
A_MAYBE_UNUSED AF1 qchB=wgB*t;
|
||||||
AF1 qfR=wgR*t+wjR*u+s;
|
A_MAYBE_UNUSED AF1 qfR=wgR*t+wjR*u+s;
|
||||||
AF1 qfG=wgG*t+wjG*u+s;
|
A_MAYBE_UNUSED AF1 qfG=wgG*t+wjG*u+s;
|
||||||
AF1 qfB=wgB*t+wjB*u+s;
|
A_MAYBE_UNUSED AF1 qfB=wgB*t+wjB*u+s;
|
||||||
AF1 qgR=wfR*s+wkR*v+t;
|
A_MAYBE_UNUSED AF1 qgR=wfR*s+wkR*v+t;
|
||||||
AF1 qgG=wfG*s+wkG*v+t;
|
A_MAYBE_UNUSED AF1 qgG=wfG*s+wkG*v+t;
|
||||||
AF1 qgB=wfB*s+wkB*v+t;
|
A_MAYBE_UNUSED AF1 qgB=wfB*s+wkB*v+t;
|
||||||
AF1 qjR=wfR*s+wkR*v+u;
|
A_MAYBE_UNUSED AF1 qjR=wfR*s+wkR*v+u;
|
||||||
AF1 qjG=wfG*s+wkG*v+u;
|
A_MAYBE_UNUSED AF1 qjG=wfG*s+wkG*v+u;
|
||||||
AF1 qjB=wfB*s+wkB*v+u;
|
A_MAYBE_UNUSED AF1 qjB=wfB*s+wkB*v+u;
|
||||||
AF1 qkR=wgR*t+wjR*u+v;
|
A_MAYBE_UNUSED AF1 qkR=wgR*t+wjR*u+v;
|
||||||
AF1 qkG=wgG*t+wjG*u+v;
|
A_MAYBE_UNUSED AF1 qkG=wgG*t+wjG*u+v;
|
||||||
AF1 qkB=wgB*t+wjB*u+v;
|
A_MAYBE_UNUSED AF1 qkB=wgB*t+wjB*u+v;
|
||||||
AF1 qinR=wjR*u;
|
A_MAYBE_UNUSED AF1 qinR=wjR*u;
|
||||||
AF1 qinG=wjG*u;
|
A_MAYBE_UNUSED AF1 qinG=wjG*u;
|
||||||
AF1 qinB=wjB*u;
|
A_MAYBE_UNUSED AF1 qinB=wjB*u;
|
||||||
AF1 qloR=wkR*v;
|
A_MAYBE_UNUSED AF1 qloR=wkR*v;
|
||||||
AF1 qloG=wkG*v;
|
A_MAYBE_UNUSED AF1 qloG=wkG*v;
|
||||||
AF1 qloB=wkB*v;
|
A_MAYBE_UNUSED AF1 qloB=wkB*v;
|
||||||
// Filter.
|
// Filter.
|
||||||
#ifndef CAS_SLOW
|
#ifndef CAS_SLOW
|
||||||
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
// Using green coef only, depending on dead code removal to strip out the extra overhead.
|
||||||
|
@ -877,6 +891,9 @@ A_STATIC void CasSetup(
|
||||||
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
|
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef TEXINPUT
|
||||||
|
#undef TEXCALL
|
||||||
#endif
|
#endif
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -885,6 +902,16 @@ A_STATIC void CasSetup(
|
||||||
// PACKED VERSION
|
// PACKED VERSION
|
||||||
//==============================================================================================================================
|
//==============================================================================================================================
|
||||||
#if defined(A_GPU) && defined(A_HALF)
|
#if defined(A_GPU) && defined(A_HALF)
|
||||||
|
#if defined(A_MSL) && !defined(CAS_TEXTUREH)
|
||||||
|
#define CAS_TEXTUREH texture2d<half>
|
||||||
|
#endif
|
||||||
|
#ifdef CAS_TEXTUREH
|
||||||
|
#define TEXCALL tex,
|
||||||
|
#define TEXINPUT CAS_TEXTUREH tex,
|
||||||
|
#else
|
||||||
|
#define TEXCALL
|
||||||
|
#define TEXINPUT
|
||||||
|
#endif
|
||||||
// Missing a way to do packed re-interpretation, so must disable approximation optimizations.
|
// Missing a way to do packed re-interpretation, so must disable approximation optimizations.
|
||||||
#ifdef A_HLSL
|
#ifdef A_HLSL
|
||||||
#ifndef CAS_GO_SLOWER
|
#ifndef CAS_GO_SLOWER
|
||||||
|
@ -893,7 +920,7 @@ A_STATIC void CasSetup(
|
||||||
#endif
|
#endif
|
||||||
//==============================================================================================================================
|
//==============================================================================================================================
|
||||||
// Can be used to convert from packed SOA to AOS for store.
|
// Can be used to convert from packed SOA to AOS for store.
|
||||||
void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
|
void CasDepack(outAH4 pix0,outAH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
|
||||||
#ifdef A_HLSL
|
#ifdef A_HLSL
|
||||||
// Invoke a slower path for DX only, since it won't allow uninitialized values.
|
// Invoke a slower path for DX only, since it won't allow uninitialized values.
|
||||||
pix0.a=pix1.a=0.0;
|
pix0.a=pix1.a=0.0;
|
||||||
|
@ -902,13 +929,14 @@ A_STATIC void CasSetup(
|
||||||
pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
|
pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
|
||||||
//==============================================================================================================================
|
//==============================================================================================================================
|
||||||
void CasFilterH(
|
void CasFilterH(
|
||||||
|
TEXINPUT
|
||||||
// Output values are for 2 8x8 tiles in a 16x8 region.
|
// Output values are for 2 8x8 tiles in a 16x8 region.
|
||||||
// pix<R,G,B>.x = right 8x8 tile
|
// pix<R,G,B>.x = right 8x8 tile
|
||||||
// pix<R,G,B>.y = left 8x8 tile
|
// pix<R,G,B>.y = left 8x8 tile
|
||||||
// This enables later processing to easily be packed as well.
|
// This enables later processing to easily be packed as well.
|
||||||
out AH2 pixR,
|
outAH2 pixR,
|
||||||
out AH2 pixG,
|
outAH2 pixG,
|
||||||
out AH2 pixB,
|
outAH2 pixB,
|
||||||
AU2 ip, // Integer pixel position in output.
|
AU2 ip, // Integer pixel position in output.
|
||||||
AU4 const0, // Constants generated by CasSetup().
|
AU4 const0, // Constants generated by CasSetup().
|
||||||
AU4 const1,
|
AU4 const1,
|
||||||
|
@ -923,25 +951,25 @@ A_STATIC void CasSetup(
|
||||||
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
|
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
|
||||||
if(noScaling){
|
if(noScaling){
|
||||||
ASW2 sp0=ASW2(ip);
|
ASW2 sp0=ASW2(ip);
|
||||||
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
|
AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
|
||||||
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
|
AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
|
||||||
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
|
AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
|
||||||
AH3 d0=CasLoadH(sp0+ASW2(-1, 0));
|
AH3 d0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
|
||||||
AH3 e0=CasLoadH(sp0);
|
AH3 e0=CasLoadH(TEXCALL sp0);
|
||||||
AH3 f0=CasLoadH(sp0+ASW2( 1, 0));
|
AH3 f0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
|
||||||
AH3 g0=CasLoadH(sp0+ASW2(-1, 1));
|
AH3 g0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
|
||||||
AH3 h0=CasLoadH(sp0+ASW2( 0, 1));
|
AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
|
||||||
AH3 i0=CasLoadH(sp0+ASW2( 1, 1));
|
AH3 i0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
|
||||||
ASW2 sp1=sp0+ASW2(8,0);
|
ASW2 sp1=sp0+ASW2(8,0);
|
||||||
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
|
AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
|
||||||
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
|
AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
|
||||||
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
|
AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
|
||||||
AH3 d1=CasLoadH(sp1+ASW2(-1, 0));
|
AH3 d1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
|
||||||
AH3 e1=CasLoadH(sp1);
|
AH3 e1=CasLoadH(TEXCALL sp1);
|
||||||
AH3 f1=CasLoadH(sp1+ASW2( 1, 0));
|
AH3 f1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
|
||||||
AH3 g1=CasLoadH(sp1+ASW2(-1, 1));
|
AH3 g1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
|
||||||
AH3 h1=CasLoadH(sp1+ASW2( 0, 1));
|
AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
|
||||||
AH3 i1=CasLoadH(sp1+ASW2( 1, 1));
|
AH3 i1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
|
||||||
// AOS to SOA conversion.
|
// AOS to SOA conversion.
|
||||||
AH2 aR=AH2(a0.r,a1.r);
|
AH2 aR=AH2(a0.r,a1.r);
|
||||||
AH2 aG=AH2(a0.g,a1.g);
|
AH2 aG=AH2(a0.g,a1.g);
|
||||||
|
@ -1033,10 +1061,10 @@ A_STATIC void CasSetup(
|
||||||
ampB=APrxLoSqrtH2(ampB);
|
ampB=APrxLoSqrtH2(ampB);
|
||||||
#endif
|
#endif
|
||||||
// Filter shape.
|
// Filter shape.
|
||||||
AH1 peak=AH2_AU1(const1.y).x;
|
A_MAYBE_UNUSED AH1 peak=AH2_AU1(const1.y).x;
|
||||||
AH2 wR=ampR*AH2_(peak);
|
A_MAYBE_UNUSED AH2 wR=ampR*AH2_(peak);
|
||||||
AH2 wG=ampG*AH2_(peak);
|
A_MAYBE_UNUSED AH2 wG=ampG*AH2_(peak);
|
||||||
AH2 wB=ampB*AH2_(peak);
|
A_MAYBE_UNUSED AH2 wB=ampB*AH2_(peak);
|
||||||
// Filter.
|
// Filter.
|
||||||
#ifndef CAS_SLOW
|
#ifndef CAS_SLOW
|
||||||
#ifdef CAS_GO_SLOWER
|
#ifdef CAS_GO_SLOWER
|
||||||
|
@ -1072,43 +1100,43 @@ A_STATIC void CasSetup(
|
||||||
ppX.x=AH1(pp.x-fp0.x);
|
ppX.x=AH1(pp.x-fp0.x);
|
||||||
AH1 ppY=AH1(pp.y-fp0.y);
|
AH1 ppY=AH1(pp.y-fp0.y);
|
||||||
ASW2 sp0=ASW2(fp0);
|
ASW2 sp0=ASW2(fp0);
|
||||||
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
|
AH3 a0=CasLoadH(TEXCALL sp0+ASW2(-1,-1));
|
||||||
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
|
AH3 b0=CasLoadH(TEXCALL sp0+ASW2( 0,-1));
|
||||||
AH3 e0=CasLoadH(sp0+ASW2(-1, 0));
|
AH3 e0=CasLoadH(TEXCALL sp0+ASW2(-1, 0));
|
||||||
AH3 f0=CasLoadH(sp0);
|
AH3 f0=CasLoadH(TEXCALL sp0);
|
||||||
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
|
AH3 c0=CasLoadH(TEXCALL sp0+ASW2( 1,-1));
|
||||||
AH3 d0=CasLoadH(sp0+ASW2( 2,-1));
|
AH3 d0=CasLoadH(TEXCALL sp0+ASW2( 2,-1));
|
||||||
AH3 g0=CasLoadH(sp0+ASW2( 1, 0));
|
AH3 g0=CasLoadH(TEXCALL sp0+ASW2( 1, 0));
|
||||||
AH3 h0=CasLoadH(sp0+ASW2( 2, 0));
|
AH3 h0=CasLoadH(TEXCALL sp0+ASW2( 2, 0));
|
||||||
AH3 i0=CasLoadH(sp0+ASW2(-1, 1));
|
AH3 i0=CasLoadH(TEXCALL sp0+ASW2(-1, 1));
|
||||||
AH3 j0=CasLoadH(sp0+ASW2( 0, 1));
|
AH3 j0=CasLoadH(TEXCALL sp0+ASW2( 0, 1));
|
||||||
AH3 m0=CasLoadH(sp0+ASW2(-1, 2));
|
AH3 m0=CasLoadH(TEXCALL sp0+ASW2(-1, 2));
|
||||||
AH3 n0=CasLoadH(sp0+ASW2( 0, 2));
|
AH3 n0=CasLoadH(TEXCALL sp0+ASW2( 0, 2));
|
||||||
AH3 k0=CasLoadH(sp0+ASW2( 1, 1));
|
AH3 k0=CasLoadH(TEXCALL sp0+ASW2( 1, 1));
|
||||||
AH3 l0=CasLoadH(sp0+ASW2( 2, 1));
|
AH3 l0=CasLoadH(TEXCALL sp0+ASW2( 2, 1));
|
||||||
AH3 o0=CasLoadH(sp0+ASW2( 1, 2));
|
AH3 o0=CasLoadH(TEXCALL sp0+ASW2( 1, 2));
|
||||||
AH3 p0=CasLoadH(sp0+ASW2( 2, 2));
|
AH3 p0=CasLoadH(TEXCALL sp0+ASW2( 2, 2));
|
||||||
// Tile 1 (offset only in x).
|
// Tile 1 (offset only in x).
|
||||||
AF1 pp1=pp.x+AF1_AU1(const1.z);
|
AF1 pp1=pp.x+AF1_AU1(const1.z);
|
||||||
AF1 fp1=floor(pp1);
|
AF1 fp1=floor(pp1);
|
||||||
ppX.y=AH1(pp1-fp1);
|
ppX.y=AH1(pp1-fp1);
|
||||||
ASW2 sp1=ASW2(fp1,sp0.y);
|
ASW2 sp1=ASW2(fp1,sp0.y);
|
||||||
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
|
AH3 a1=CasLoadH(TEXCALL sp1+ASW2(-1,-1));
|
||||||
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
|
AH3 b1=CasLoadH(TEXCALL sp1+ASW2( 0,-1));
|
||||||
AH3 e1=CasLoadH(sp1+ASW2(-1, 0));
|
AH3 e1=CasLoadH(TEXCALL sp1+ASW2(-1, 0));
|
||||||
AH3 f1=CasLoadH(sp1);
|
AH3 f1=CasLoadH(TEXCALL sp1);
|
||||||
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
|
AH3 c1=CasLoadH(TEXCALL sp1+ASW2( 1,-1));
|
||||||
AH3 d1=CasLoadH(sp1+ASW2( 2,-1));
|
AH3 d1=CasLoadH(TEXCALL sp1+ASW2( 2,-1));
|
||||||
AH3 g1=CasLoadH(sp1+ASW2( 1, 0));
|
AH3 g1=CasLoadH(TEXCALL sp1+ASW2( 1, 0));
|
||||||
AH3 h1=CasLoadH(sp1+ASW2( 2, 0));
|
AH3 h1=CasLoadH(TEXCALL sp1+ASW2( 2, 0));
|
||||||
AH3 i1=CasLoadH(sp1+ASW2(-1, 1));
|
AH3 i1=CasLoadH(TEXCALL sp1+ASW2(-1, 1));
|
||||||
AH3 j1=CasLoadH(sp1+ASW2( 0, 1));
|
AH3 j1=CasLoadH(TEXCALL sp1+ASW2( 0, 1));
|
||||||
AH3 m1=CasLoadH(sp1+ASW2(-1, 2));
|
AH3 m1=CasLoadH(TEXCALL sp1+ASW2(-1, 2));
|
||||||
AH3 n1=CasLoadH(sp1+ASW2( 0, 2));
|
AH3 n1=CasLoadH(TEXCALL sp1+ASW2( 0, 2));
|
||||||
AH3 k1=CasLoadH(sp1+ASW2( 1, 1));
|
AH3 k1=CasLoadH(TEXCALL sp1+ASW2( 1, 1));
|
||||||
AH3 l1=CasLoadH(sp1+ASW2( 2, 1));
|
AH3 l1=CasLoadH(TEXCALL sp1+ASW2( 2, 1));
|
||||||
AH3 o1=CasLoadH(sp1+ASW2( 1, 2));
|
AH3 o1=CasLoadH(TEXCALL sp1+ASW2( 1, 2));
|
||||||
AH3 p1=CasLoadH(sp1+ASW2( 2, 2));
|
AH3 p1=CasLoadH(TEXCALL sp1+ASW2( 2, 2));
|
||||||
// AOS to SOA conversion.
|
// AOS to SOA conversion.
|
||||||
AH2 aR=AH2(a0.r,a1.r);
|
AH2 aR=AH2(a0.r,a1.r);
|
||||||
AH2 aG=AH2(a0.g,a1.g);
|
AH2 aG=AH2(a0.g,a1.g);
|
||||||
|
@ -1393,30 +1421,30 @@ A_STATIC void CasSetup(
|
||||||
v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
|
v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
|
||||||
#endif
|
#endif
|
||||||
// Final weighting.
|
// Final weighting.
|
||||||
AH2 qbeR=wfR*s;
|
A_MAYBE_UNUSED AH2 qbeR=wfR*s;
|
||||||
AH2 qbeG=wfG*s;
|
A_MAYBE_UNUSED AH2 qbeG=wfG*s;
|
||||||
AH2 qbeB=wfB*s;
|
A_MAYBE_UNUSED AH2 qbeB=wfB*s;
|
||||||
AH2 qchR=wgR*t;
|
A_MAYBE_UNUSED AH2 qchR=wgR*t;
|
||||||
AH2 qchG=wgG*t;
|
A_MAYBE_UNUSED AH2 qchG=wgG*t;
|
||||||
AH2 qchB=wgB*t;
|
A_MAYBE_UNUSED AH2 qchB=wgB*t;
|
||||||
AH2 qfR=wgR*t+wjR*u+s;
|
A_MAYBE_UNUSED AH2 qfR=wgR*t+wjR*u+s;
|
||||||
AH2 qfG=wgG*t+wjG*u+s;
|
A_MAYBE_UNUSED AH2 qfG=wgG*t+wjG*u+s;
|
||||||
AH2 qfB=wgB*t+wjB*u+s;
|
A_MAYBE_UNUSED AH2 qfB=wgB*t+wjB*u+s;
|
||||||
AH2 qgR=wfR*s+wkR*v+t;
|
A_MAYBE_UNUSED AH2 qgR=wfR*s+wkR*v+t;
|
||||||
AH2 qgG=wfG*s+wkG*v+t;
|
A_MAYBE_UNUSED AH2 qgG=wfG*s+wkG*v+t;
|
||||||
AH2 qgB=wfB*s+wkB*v+t;
|
A_MAYBE_UNUSED AH2 qgB=wfB*s+wkB*v+t;
|
||||||
AH2 qjR=wfR*s+wkR*v+u;
|
A_MAYBE_UNUSED AH2 qjR=wfR*s+wkR*v+u;
|
||||||
AH2 qjG=wfG*s+wkG*v+u;
|
A_MAYBE_UNUSED AH2 qjG=wfG*s+wkG*v+u;
|
||||||
AH2 qjB=wfB*s+wkB*v+u;
|
A_MAYBE_UNUSED AH2 qjB=wfB*s+wkB*v+u;
|
||||||
AH2 qkR=wgR*t+wjR*u+v;
|
A_MAYBE_UNUSED AH2 qkR=wgR*t+wjR*u+v;
|
||||||
AH2 qkG=wgG*t+wjG*u+v;
|
A_MAYBE_UNUSED AH2 qkG=wgG*t+wjG*u+v;
|
||||||
AH2 qkB=wgB*t+wjB*u+v;
|
A_MAYBE_UNUSED AH2 qkB=wgB*t+wjB*u+v;
|
||||||
AH2 qinR=wjR*u;
|
A_MAYBE_UNUSED AH2 qinR=wjR*u;
|
||||||
AH2 qinG=wjG*u;
|
A_MAYBE_UNUSED AH2 qinG=wjG*u;
|
||||||
AH2 qinB=wjB*u;
|
A_MAYBE_UNUSED AH2 qinB=wjB*u;
|
||||||
AH2 qloR=wkR*v;
|
A_MAYBE_UNUSED AH2 qloR=wkR*v;
|
||||||
AH2 qloG=wkG*v;
|
A_MAYBE_UNUSED AH2 qloG=wkG*v;
|
||||||
AH2 qloB=wkB*v;
|
A_MAYBE_UNUSED AH2 qloB=wkB*v;
|
||||||
// Filter.
|
// Filter.
|
||||||
#ifndef CAS_SLOW
|
#ifndef CAS_SLOW
|
||||||
#ifdef CAS_GO_SLOWER
|
#ifdef CAS_GO_SLOWER
|
||||||
|
@ -1442,4 +1470,7 @@ A_STATIC void CasSetup(
|
||||||
pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
|
pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef TEXINPUT
|
||||||
|
#undef TEXCALL
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue