gsdx-ogl-wnd: VS2010 doesn't support string literals bigger than 64k bytes, so remove the PS3 & 360 shaders from fxaa

git-svn-id: http://pcsx2.googlecode.com/svn/branches/gsdx-ogl-wnd@5664 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2013-06-15 10:06:47 +00:00
parent 19961175c9
commit adc232cb95
2 changed files with 0 additions and 1634 deletions

View File

@ -825,823 +825,6 @@ A. In the last opaque pass prior to FXAA,
/*============================================================================
FXAA3 CONSOLE - 360 PIXEL SHADER
------------------------------------------------------------------------------
There may be some optimizations left here;
as of this latest change no PIX dump was available to verify whether it is TEX bound.
============================================================================*/
#if (FXAA_360 == 1)
/*--------------------------------------------------------------------------*/
// FXAA3 Console pixel shader, FXAA_360 variant.
//
// Reads luma from the .w channel of four corner taps and the center tap.
// If the local luma range is below the edge threshold the center texel is
// returned untouched. Otherwise two pairs of taps are taken along the
// estimated edge direction and blended; if the wide blend's luma leaves the
// corner luma range, its color is replaced by the narrow (two-tap) blend.
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = alpha output is junk value
    // (on input, the code below reads luma from the .w channel of every tap)
    FxaaTex tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PC version of FXAA Console
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
    // Corner lumas packed as {x = NW, y = NE, z = SW, w = SE}.
    half4 cornerLuma;
    cornerLuma.x = FxaaTexTop(tex, posPos.xy).w;
    cornerLuma.y = FxaaTexTop(tex, posPos.zy).w;
    cornerLuma.z = FxaaTexTop(tex, posPos.xw).w;
    cornerLuma.w = FxaaTexTop(tex, posPos.zw).w;

    // Center texel; also the value returned on early exit.
    half4 centerTexel = FxaaTexTop(tex, pos.xy);

    // Small constant bias on the NE luma.
    // NOTE(review): presumably keeps the direction vector from being exactly
    // zero on flat regions -- the source does not state the reason.
    cornerLuma.y += 1.0/384.0;

    // Min/max over the four corners, reduced pairwise (xy against zw).
    half2 pairMin = min(cornerLuma.xy, cornerLuma.zw);
    half2 pairMax = max(cornerLuma.xy, cornerLuma.zw);
    half lumaLo = min(pairMin.x, pairMin.y);
    half lumaHi = max(pairMax.x, pairMax.y);

    // Early exit: the range including the center is compared against a
    // threshold derived from the corner-only maximum.
    half lumaLoM = min(lumaLo, centerTexel.w);
    half lumaHiM = max(lumaHi, centerTexel.w);
    if((lumaHiM - lumaLoM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaHi * FXAA_CONSOLE_EDGE_THRESHOLD)) {
        return centerTexel;
    }

    // Edge direction from corner luma differences:
    //   x = (SW + SE) - (NW + NE),  y = (NW + SW) - (NE + SE)
    half2 edgeDir;
    edgeDir.x = dot(cornerLuma, float4(-1.0, -1.0, 1.0, 1.0));
    edgeDir.y = dot(cornerLuma, float4( 1.0, -1.0, 1.0,-1.0));

    // Unit-length step for the near taps.
    half2 stepNear = normalize(edgeDir.xy);

    // Stretched step for the far taps, clamped to +/-2 per axis.
    half nearAbsMinTimesC = min(abs(stepNear.x), abs(stepNear.y)) * FXAA_CONSOLE_EDGE_SHARPNESS;
    half2 stepFar = clamp(stepNear.xy / nearAbsMinTimesC, -2.0, 2.0);

    // Near taps use the 0.5/size scale (.zw), far taps the 2.0/size scale (.xy).
    half4 nearNeg = FxaaTexTop(tex, pos.xy - stepNear * rcpFrameOpt.zw);
    half4 nearPos = FxaaTexTop(tex, pos.xy + stepNear * rcpFrameOpt.zw);
    half4 farNeg  = FxaaTexTop(tex, pos.xy - stepFar * rcpFrameOpt.xy);
    half4 farPos  = FxaaTexTop(tex, pos.xy + stepFar * rcpFrameOpt.xy);

    // nearBlend = average of the near pair; wideBlend = average of all four.
    half4 nearBlend = nearNeg * 0.5 + nearPos * 0.5;
    half4 wideBlend = farNeg * 0.25 + farPos * 0.25 + nearBlend * 0.5;

    // If the wide blend's luma falls outside the corner range, use the near
    // blend's color instead (only .xyz is replaced; .w stays as computed).
    if((wideBlend.w < lumaLo) || (wideBlend.w > lumaHi)) {
        wideBlend.xyz = nearBlend.xyz;
    }
    return wideBlend;
}
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE
------------------------------------------------------------------------------
This prototype optimized version thanks to suggestions from Andy Luedke.
Should be fully tex bound in all cases.
As of the FXAA 3.10 release I have not tested this code,
but at least the missing ";" was fixed.
If it does not work, please let me know so I can fix it.
------------------------------------------------------------------------------
Extra requirements,
(1.) Different inputs: no posPos.
(2.) Different inputs: alias three samplers with different exp bias settings!
(3.) New constants: setup fxaaConst as described below.
============================================================================*/
#if (FXAA_360_OPT == 1)
/*--------------------------------------------------------------------------*/
// FXAA3 Console pixel shader, FXAA_360_OPT prototype.
//
// Same algorithm as the FXAA_360 variant, but the corner taps are fetched
// with tfetch2D pixel offsets (no posPos input) and the blend weights are
// expected to come from the exponent bias of the aliased samplers.
//
// Review fixes relative to the original prototype:
//   * fxaaConstDir / fxaaConstInner / fxaaConstOuter now have a declared
//     type (float4, as their documented values require).
//   * lumaMinM / lumaMaxM are scalars, so the early-exit `if` condition is
//     a scalar instead of a float4.
//   * The far-tap offset is scaled by fxaaConstDir.zzww (+/-0.25) instead of
//     fxaaConstOuter.zzww. With the documented constants this reproduces
//     clamp(dir / dirAbsMinTimesC, -2, 2) * 2.0/size from the FXAA_360 path.
//   * The final blend is rgbyN2 + rgbyP2 + rgbyA * 0.5.
// NOTE(review): the blend assumes an exponent bias of -1 / -2 scales the
// fetched texel by 0.5 / 0.25, which makes it match the 0.25/0.25/0.5
// weights of the FXAA_360 path -- confirm against the sampler setup.
[reduceTempRegUsage(4)]
float4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // Three samplers,
    //   texExpBias0    = exponent bias 0
    //   texExpBiasNeg1 = exponent bias -1
    //   texExpBiasNeg2 = exponent bias -2
    // {rgb_} = color in linear or perceptual color space
    // {___a} = alpha output is junk value
    uniform sampler2D texExpBias0,
    uniform sampler2D texExpBiasNeg1,
    uniform sampler2D texExpBiasNeg2,
    // These must be in physical constant registers and NOT immediates
    // Immediates will result in compiler un-optimizing
    //   width  = screen width in pixels
    //   height = screen height in pixels
    float4 fxaaConstDir,   // float4(1.0, -1.0, 0.25, -0.25);
    float4 fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height);
    float4 fxaaConstOuter  // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height);
) {
/*--------------------------------------------------------------------------*/
    // Corner lumas packed as {x = NW, y = NE, z = SW, w = SE}; each fetch
    // routes the texel's .w channel into one component.
    float4 lumaNwNeSwSe;
    asm {
        tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false
    };
/*--------------------------------------------------------------------------*/
    // Bias the NE luma, then reduce the four corners to min/max.
    lumaNwNeSwSe.y += 1.0/384.0;
    float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);
    float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);
/*--------------------------------------------------------------------------*/
    // Early exit on low local contrast. The min/max including the center
    // are scalars so the comparison below is a scalar condition.
    float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0));
    float lumaMinM = min(lumaMin, rgbyM.w);
    float lumaMaxM = max(lumaMax, rgbyM.w);
    if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;
/*--------------------------------------------------------------------------*/
    // Edge direction; with fxaaConstDir = (1, -1, ...) the swizzles give
    //   .yyxx = (-1, -1, 1, 1)  and  .xyxy = (1, -1, 1, -1),
    // the same weights the FXAA_360 path uses as literals.
    float2 dir;
    dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx);
    dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy);
    dir = normalize(dir);
/*--------------------------------------------------------------------------*/
    // Near-tap offsets: {xy} = +dir * 0.5/size, {zw} = -dir * 0.5/size.
    float4 dir1 = dir.xyxy * fxaaConstInner.xyzw;
/*--------------------------------------------------------------------------*/
    // Far-tap offsets. saturate(+/-0.25 * t + 0.5) maps t in [-2, 2] onto
    // [0, 1]; scaling by 8/size and adding -4/size then yields
    // +/-clamp(t, -2, 2) * 2/size in {xy} / {zw}.
    float4 dir2;
    float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)) * FXAA_CONSOLE_EDGE_SHARPNESS;
    dir2 = saturate(fxaaConstDir.zzww * dir.xyxy / dirAbsMinTimesC + 0.5);
    dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw;
/*--------------------------------------------------------------------------*/
    float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0));
    float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0));
    float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0));
    float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
    // rgbyA = near-pair blend; rgbyB = blend of all four taps.
    half4 rgbyA = rgbyN1 + rgbyP1;
    half4 rgbyB = rgbyN2 + rgbyP2 + rgbyA * 0.5;
/*--------------------------------------------------------------------------*/
    // Use rgbyA whenever rgbyB's luma is above lumaMax or not above lumaMin.
    float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB;
    rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA;
    return rgbyR; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)
==============================================================================
The code below does not exactly match the assembly.
I have a feeling that 12 cycles is possible, but was not able to get there.
Might have to increase register count to get full performance.
Note this shader does not use perspective interpolation.
Use the following cgc options,
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
------------------------------------------------------------------------------
NVSHADERPERF OUTPUT
------------------------------------------------------------------------------
For reference and to aid in debug, output of NVShaderPerf should match this,
Shader to schedule:
0: texpkb h0.w(TRUE), v5.zyxx, #0
2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
4: texpkb h0.w(TRUE), v5.xwxx, #0
6: addh h0.z(TRUE), -h2, h0.w
7: texpkb h1.w(TRUE), v5, #0
9: addh h0.x(TRUE), h0.z, -h1.w
10: addh h3.w(TRUE), h0.z, h1
11: texpkb h2.w(TRUE), v5.zwzz, #0
13: addh h0.z(TRUE), h3.w, -h2.w
14: addh h0.x(TRUE), h2.w, h0
15: nrmh h1.xz(TRUE), h0_n
16: minh_m8 h0.x(TRUE), |h1|, |h1.z|
17: maxh h4.w(TRUE), h0, h1
18: divx h2.xy(TRUE), h1_n.xzzw, h0_n
19: movr r1.zw(TRUE), v4.xxxy
20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww
22: minh h5.w(TRUE), h0, h1
23: texpkb h0(TRUE), r2.xzxx, #0
25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1
27: maxh h4.x(TRUE), h2.z, h2.w
28: texpkb h1(TRUE), r0.zwzz, #0
30: addh_d2 h1(TRUE), h0, h1
31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
33: texpkb h0(TRUE), r0, #0
35: minh h4.z(TRUE), h2, h2.w
36: fenct TRUE
37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
39: texpkb h2(TRUE), r1, #0
41: addh_d2 h0(TRUE), h0, h2
42: maxh h2.w(TRUE), h4, h4.x
43: minh h2.x(TRUE), h5.w, h4.z
44: addh_d2 h0(TRUE), h0, h1
45: slth h2.x(TRUE), h0.w, h2
46: sgth h2.w(TRUE), h0, h2
47: movh h0(TRUE), h0
48: addx.c0 rc(TRUE), h2, h2.w
49: movh h0(c0.NE.x), h1
IPU0 ------ Simplified schedule: --------
Pass | Unit | uOp | PC: Op
-----+--------+------+-------------------------
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-;
| | |
2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
| TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
| SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-;
| | |
3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
| TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
| SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---;
| SCB1 | add | 10: ADDh h3.w, h0.---z, h1;
| | |
4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| SCB0 | add | 14: ADDh h0.x, h2.w---, h0;
| SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-;
| | |
5 | SCT1 | mov | 15: NRMh h1.xz, h0;
| SRB | nrm | 15: NRMh h1.xz, h0;
| SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|;
| SCB1 | max | 17: MAXh h4.w, h0, h1;
| | |
6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0;
| SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy;
| SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-;
| SCB1 | min | 22: MINh h5.w, h0, h1;
| | |
7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
| TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
| SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---;
| SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1;
| | |
8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
| TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
| SCB0/1 | add | 30: ADDh/2 h1, h0, h1;
| | |
9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--;
| SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0;
| TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0;
| SCB1 | min | 35: MINh h4.z, h2, h2.--w-;
| | |
10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--;
| SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0;
| TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0;
| SCB0/1 | add | 41: ADDh/2 h0, h0, h2;
| | |
11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---;
| SCT1 | max | 42: MAXh h2.w, h4, h4.---x;
| SCB0/1 | add | 44: ADDh/2 h0, h0, h1;
| | |
12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;
| SCT1 | set | 46: SGTh h2.w, h0, h2;
| SCB0/1 | mul | 47: MOVh h0, h0;
| | |
13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;
| SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;
Pass SCT TEX SCB
1: 0% 100% 25%
2: 0% 100% 25%
3: 0% 100% 50%
4: 0% 100% 50%
5: 0% 0% 50%
6: 100% 0% 75%
7: 0% 100% 75%
8: 0% 100% 100%
9: 0% 100% 25%
10: 0% 100% 100%
11: 50% 0% 100%
12: 50% 0% 100%
13: 25% 0% 100%
MEAN: 17% 61% 67%
Pass SCT0 SCT1 TEX SCB0 SCB1
1: 0% 0% 100% 0% 100%
2: 0% 0% 100% 0% 100%
3: 0% 0% 100% 100% 100%
4: 0% 0% 100% 100% 100%
5: 0% 0% 0% 100% 100%
6: 100% 100% 0% 100% 100%
7: 0% 0% 100% 100% 100%
8: 0% 0% 100% 100% 100%
9: 0% 0% 100% 0% 100%
10: 0% 0% 100% 100% 100%
11: 100% 100% 0% 100% 100%
12: 100% 100% 0% 100% 100%
13: 100% 0% 0% 100% 100%
MEAN: 30% 23% 61% 76% 100%
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
Results 13 cycles, 3 r regs, 923,076,923 pixels/s
============================================================================*/
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)
/*--------------------------------------------------------------------------*/
#pragma disablepc all
#pragma option O3
#pragma option OutColorPrec=fp16
#pragma texformat default RGBA8
/*==========================================================================*/
// FXAA3 Console pixel shader, PS3 variant without early exit.
//
// Builds an edge direction from the luma (.w) of four corner taps, takes a
// near and a far pair of taps along that direction, and returns the average
// of all four -- or just the near-pair average when the four-tap luma falls
// outside the corner luma range.
//
// The "(n)" markers are kept from the original. NOTE(review): they presumably
// correspond to the pass numbers of the NVShaderPerf schedule quoted in the
// header comment, so statement order is left untouched.
//
// Review fix: dir.y is now set to 0.0 before normalize(dir.xyz). It was
// previously read without ever being written, unlike the PS3 early-exit and
// PC variants, which both zero it.
// NOTE(review): re-run NVShaderPerf after this change; the extra write may
// alter the published 13-cycle schedule.
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = luma in perceptual color space (not linear)
    sampler2D tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PS3
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
// (1)
    half4 dir;
    // Only x and z carry the direction; y must be zero so that the 3-vector
    // normalize below acts on (x, z) alone.
    dir.y = 0.0;
    half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
    lumaNe.w += half(1.0/512.0);
    dir.x = -lumaNe.w;
    dir.z = -lumaNe.w;
/*--------------------------------------------------------------------------*/
// (2)
    half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
    dir.x += lumaSw.w;
    dir.z += lumaSw.w;
/*--------------------------------------------------------------------------*/
// (3)
    half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
    dir.x -= lumaNw.w;
    dir.z += lumaNw.w;
/*--------------------------------------------------------------------------*/
// (4)
    // After this step: dir.x = SW + SE - NW - NE, dir.z = NW + SW - NE - SE.
    half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
    dir.x += lumaSe.w;
    dir.z -= lumaSe.w;
/*--------------------------------------------------------------------------*/
// (5)
    // dir1_pos.xy = unit direction; .zw is filled with pos.xy in step (6).
    half4 dir1_pos;
    dir1_pos.xy = normalize(dir.xyz).xz;
    half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);
/*--------------------------------------------------------------------------*/
// (6)
    // dir2_pos.xy = stretched direction, clamped to +/-2 per axis.
    half4 dir2_pos;
    dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
    dir1_pos.zw = pos.xy;
    dir2_pos.zw = pos.xy;
    // temp1N.xy first holds the tap coordinate; step (7) overwrites the
    // whole vector with the fetched texel.
    half4 temp1N;
    temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (7)
    temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
    half4 rgby1;
    rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (8)
    // rgby1 = average of the near pair of taps.
    rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
    rgby1 = (temp1N + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (9)
    half4 temp2N;
    temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;
    temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
// (10)
    // rgby2 = average of the far pair of taps.
    half4 rgby2;
    rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;
    rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
    rgby2 = (temp2N + rgby2) * 0.5;
/*--------------------------------------------------------------------------*/
// (11)
    // compiler moves these scalar ops up to other cycles
    half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));
    half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));
    // rgby2 now becomes the average of all four taps.
    rgby2 = (rgby2 + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (12)
    bool twoTapLt = rgby2.w < lumaMin;
    bool twoTapGt = rgby2.w > lumaMax;
/*--------------------------------------------------------------------------*/
// (13)
    // Fall back to the near-pair average when the four-tap luma is out of
    // the corner range.
    if(twoTapLt || twoTapGt) rgby2 = rgby1;
/*--------------------------------------------------------------------------*/
    return rgby2; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)
==============================================================================
The code mostly matches the assembly.
I have a feeling that 14 cycles is possible, but was not able to get there.
Might have to increase register count to get full performance.
Note this shader does not use perspective interpolation.
Use the following cgc options,
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
------------------------------------------------------------------------------
NVSHADERPERF OUTPUT
------------------------------------------------------------------------------
For reference and to aid in debug, output of NVShaderPerf should match this,
Shader to schedule:
0: texpkb h0.w(TRUE), v5.zyxx, #0
2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
4: texpkb h1.w(TRUE), v5.xwxx, #0
6: addh h0.x(TRUE), h1.w, -h2.y
7: texpkb h2.w(TRUE), v5.zwzz, #0
9: minh h4.w(TRUE), h2.y, h2
10: maxh h5.x(TRUE), h2.y, h2.w
11: texpkb h0.w(TRUE), v5, #0
13: addh h3.w(TRUE), -h0, h0.x
14: addh h0.x(TRUE), h0.w, h0
15: addh h0.z(TRUE), -h2.w, h0.x
16: addh h0.x(TRUE), h2.w, h3.w
17: minh h5.y(TRUE), h0.w, h1.w
18: nrmh h2.xz(TRUE), h0_n
19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|
20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w
21: movr r1.zw(TRUE), v4.xxxy
22: maxh h2.w(TRUE), h0, h1
23: fenct TRUE
24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
26: texpkb h0(TRUE), r0, #0
28: maxh h5.x(TRUE), h2.w, h5
29: minh h5.w(TRUE), h5.y, h4
30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
32: texpkb h2(TRUE), r1, #0
34: addh_d2 h2(TRUE), h0, h2
35: texpkb h1(TRUE), v4, #0
37: maxh h5.y(TRUE), h5.x, h1.w
38: minh h4.w(TRUE), h1, h5
39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
41: texpkb h0(TRUE), r0, #0
43: addh_m8 h5.z(TRUE), h5.y, -h4.w
44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
46: texpkb h3(TRUE), r2, #0
48: addh_d2 h0(TRUE), h0, h3
49: addh_d2 h3(TRUE), h0, h2
50: movh h0(TRUE), h3
51: slth h3.x(TRUE), h3.w, h5.w
52: sgth h3.w(TRUE), h3, h5.x
53: addx.c0 rc(TRUE), h3.x, h3
54: slth.c0 rc(TRUE), h5.z, h5
55: movh h0(c0.NE.w), h2
56: movh h0(c0.NE.x), h1
IPU0 ------ Simplified schedule: --------
Pass | Unit | uOp | PC: Op
-----+--------+------+-------------------------
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--;
| | |
2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
| TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
| SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---;
| | |
3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---;
| SCB1 | min | 9: MINh h4.w, h2.---y, h2;
| | |
4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
| TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
| SCB0 | add | 14: ADDh h0.x, h0.w---, h0;
| SCB1 | add | 13: ADDh h3.w,-h0, h0.---x;
| | |
5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---;
| SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-;
| SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--;
| | |
6 | SCT1 | mov | 18: NRMh h2.xz, h0;
| SRB | nrm | 18: NRMh h2.xz, h0;
| SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|;
| | |
7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--;
| SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy;
| SCB1 | max | 22: MAXh h2.w, h0, h1;
| | |
8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;
| SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0;
| TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0;
| SCB0 | max | 28: MAXh h5.x, h2.w---, h5;
| SCB1 | min | 29: MINh h5.w, h5.---y, h4;
| | |
9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--;
| SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0;
| TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0;
| SCB0/1 | add | 34: ADDh/2 h2, h0, h2;
| | |
10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
| TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
| SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--;
| SCB1 | min | 38: MINh h4.w, h1, h5;
| | |
11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--;
| SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0;
| TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0;
| SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--;
| SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-;
| | |
12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0;
| TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0;
| SCB0/1 | add | 48: ADDh/2 h0, h0, h3;
| | |
13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2;
| SCB0/1 | mul | 50: MOVh h0, h3;
| | |
14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---;
| SCT1 | set | 52: SGTh h3.w, h3, h5.---x;
| SCB0 | set | 54: SLThc0 rc, h5.z---, h5;
| SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3;
| | |
15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2;
| SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1;
Pass SCT TEX SCB
1: 0% 100% 25%
2: 0% 100% 25%
3: 0% 100% 50%
4: 0% 100% 50%
5: 50% 0% 25%
6: 0% 0% 25%
7: 100% 0% 25%
8: 0% 100% 50%
9: 0% 100% 100%
10: 0% 100% 50%
11: 0% 100% 75%
12: 0% 100% 100%
13: 100% 0% 100%
14: 50% 0% 50%
15: 100% 0% 100%
MEAN: 26% 60% 56%
Pass SCT0 SCT1 TEX SCB0 SCB1
1: 0% 0% 100% 100% 0%
2: 0% 0% 100% 100% 0%
3: 0% 0% 100% 100% 100%
4: 0% 0% 100% 100% 100%
5: 100% 100% 0% 100% 0%
6: 0% 0% 0% 0% 100%
7: 100% 100% 0% 0% 100%
8: 0% 0% 100% 100% 100%
9: 0% 0% 100% 100% 100%
10: 0% 0% 100% 100% 100%
11: 0% 0% 100% 100% 100%
12: 0% 0% 100% 100% 100%
13: 100% 100% 0% 100% 100%
14: 100% 100% 0% 100% 100%
15: 100% 100% 0% 100% 100%
MEAN: 33% 33% 60% 86% 80%
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
Results 15 cycles, 3 r regs, 800,000,000 pixels/s
============================================================================*/
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)
/*--------------------------------------------------------------------------*/
#pragma disablepc all
#pragma option O2
#pragma option OutColorPrec=fp16
#pragma texformat default RGBA8
/*==========================================================================*/
// FXAA3 Console pixel shader, PS3 variant with early exit.
//
// Returns one of three values, chosen by selects at the end rather than by
// an early `return`:
//   rgbyM - the unfiltered center texel, when the local luma range is small;
//   rgby1 - the near-pair average, when the four-tap luma leaves the corner
//           luma range;
//   rgby2 - the four-tap average otherwise.
//
// NOTE(review): the "(n)" markers presumably correspond to the pass numbers
// of the NVShaderPerf schedule quoted in the header comment; treat the
// statement order as deliberate and confirm before reordering anything.
half4 FxaaPixelShader(
// {xy} = center of pixel
float2 pos,
// {xy__} = upper left of pixel
// {__zw} = lower right of pixel
float4 posPos,
// {rgb_} = color in linear or perceptual color space
// {___a} = luma in perceptual color space (not linear)
sampler2D tex,
// This must be from a constant/uniform.
// {xy} = rcpFrame not used on PS3
float2 rcpFrame,
// This must be from a constant/uniform.
// {x___} = 2.0/screenWidthInPixels
// {_y__} = 2.0/screenHeightInPixels
// {__z_} = 0.5/screenWidthInPixels
// {___w} = 0.5/screenHeightInPixels
float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
// (1)
// NE corner tap; its luma (.w) gets a constant 1/512 bias.
half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
half lumaNe = rgbyNe.w + half(1.0/512.0);
/*--------------------------------------------------------------------------*/
// (2)
// SW - NE is the term shared by both direction components.
half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
half lumaSwNegNe = lumaSw.w - lumaNe;
/*--------------------------------------------------------------------------*/
// (3)
half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);
half lumaMinNwSw = min(lumaNw.w, lumaSw.w);
/*--------------------------------------------------------------------------*/
// (4)
// Partial sums: dirZ = NW + SW - NE, dirX = -NW + SW - NE.
half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
half dirZ = lumaNw.w + lumaSwNegNe;
half dirX = -lumaNw.w + lumaSwNegNe;
/*--------------------------------------------------------------------------*/
// (5)
// Final direction: x = SW + SE - NW - NE, z = NW + SW - NE - SE.
// y is zeroed so the 3-vector normalize in (6) acts on (x, z) only.
half3 dir;
dir.y = 0.0;
dir.x = lumaSe.w + dirX;
dir.z = -lumaSe.w + dirZ;
half lumaMinNeSe = min(lumaNe, lumaSe.w);
/*--------------------------------------------------------------------------*/
// (6)
// dir1_pos.xy = unit direction; .zw is filled with pos.xy in step (7).
// NOTE(review): the name "Times8" suggests a sharpness of 8, but the actual
// factor is whatever FXAA_CONSOLE_EDGE_SHARPNESS is defined as.
half4 dir1_pos;
dir1_pos.xy = normalize(dir).xz;
half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);
/*--------------------------------------------------------------------------*/
// (7)
// dir2_pos.xy = stretched direction, clamped to +/-2 per axis.
// pos.xy is copied into the .zw halves of both vectors.
half4 dir2_pos;
dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));
dir1_pos.zw = pos.xy;
dir2_pos.zw = pos.xy;
half lumaMaxNeSe = max(lumaNe, lumaSe.w);
/*--------------------------------------------------------------------------*/
// (8)
// temp1N.xy first holds the tap coordinate, then the whole vector is
// overwritten with the fetched texel. lumaMin/lumaMax cover the four
// corners only (NE includes its bias).
half4 temp1N;
temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;
temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);
half lumaMin = min(lumaMinNwSw, lumaMinNeSe);
/*--------------------------------------------------------------------------*/
// (9)
// rgby1 = average of the near pair of taps (same coordinate-then-texel
// reuse of the variable as temp1N).
half4 rgby1;
rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;
rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
rgby1 = (temp1N + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (10)
// Center texel, and the luma range extended to include it.
half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));
half lumaMaxM = max(lumaMax, rgbyM.w);
half lumaMinM = min(lumaMin, rgbyM.w);
/*--------------------------------------------------------------------------*/
// (11)
// The range is divided by the threshold here so that step (14) can compare
// it directly against lumaMax; for a positive threshold this is the same
// test as range < lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD.
// Unlike the 360 and PC variants, FXAA_CONSOLE_EDGE_THRESHOLD_MIN is not
// referenced in this variant.
half4 temp2N;
temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;
temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
half4 rgby2;
rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;
half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE_EDGE_THRESHOLD;
/*--------------------------------------------------------------------------*/
// (12)
// rgby2 = average of the far pair of taps.
rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
rgby2 = (temp2N + rgby2) * 0.5;
/*--------------------------------------------------------------------------*/
// (13)
// rgby2 = average of all four taps.
rgby2 = (rgby2 + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (14)
bool twoTapLt = rgby2.w < lumaMin;
bool twoTapGt = rgby2.w > lumaMax;
bool earlyExit = lumaRangeM < lumaMax;
bool twoTap = twoTapLt || twoTapGt;
/*--------------------------------------------------------------------------*/
// (15)
// The earlyExit assignment comes last, so it wins over the twoTap fallback.
if(twoTap) rgby2 = rgby1;
if(earlyExit) rgby2 = rgbyM;
/*--------------------------------------------------------------------------*/
return rgby2; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - PC PIXEL SHADER
------------------------------------------------------------------------------
Using a modified version of the PS3 version here to best target old hardware.
============================================================================*/
#if (FXAA_PC_CONSOLE == 1)
/*--------------------------------------------------------------------------*/
// FXAA3 Console pixel shader, PC variant.
//
// Builds an edge direction from the luma (.w) of four corner taps, takes a
// near and a far pair of taps along it, and returns the four-tap average --
// or the near-pair average when the four-tap luma leaves the corner luma
// range. With FXAA_EARLY_EXIT == 1, low-contrast pixels either return the
// center texel or invoke FxaaDiscard (when FXAA_DISCARD == 1).
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = alpha output is junk value
    // (on input, the code below reads luma from the .w channel of every tap)
    FxaaTex tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PC version of FXAA Console
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
    // Direction accumulator: only x and z are used; y is zero so that the
    // 3-vector normalize below acts on (x, z) alone.
    half3 edgeDir;
    edgeDir.y = 0.0;

    // NE corner; its luma receives a constant 1/384 bias.
    half4 tapNe = FxaaTexTop(tex, posPos.zy);
    tapNe.w += half(1.0/384.0);
    edgeDir.x = -tapNe.w;
    edgeDir.z = -tapNe.w;

    // SW corner.
    half4 tapSw = FxaaTexTop(tex, posPos.xw);
    edgeDir.x += tapSw.w;
    edgeDir.z += tapSw.w;

    // NW corner.
    half4 tapNw = FxaaTexTop(tex, posPos.xy);
    edgeDir.x -= tapNw.w;
    edgeDir.z += tapNw.w;

    // SE corner. Afterwards: x = SW + SE - NW - NE, z = NW + SW - NE - SE.
    half4 tapSe = FxaaTexTop(tex, posPos.zw);
    edgeDir.x += tapSe.w;
    edgeDir.z -= tapSe.w;

    #if (FXAA_EARLY_EXIT == 1)
        // Center texel and corner luma range (NE includes its bias).
        half4 tapM = FxaaTexTop(tex, pos.xy);
        half cornerLumaMin = min(min(tapNw.w, tapSw.w), min(tapNe.w, tapSe.w));
        half cornerLumaMax = max(max(tapNw.w, tapSw.w), max(tapNe.w, tapSe.w));

        // Range extended to include the center texel.
        half allLumaMin = min(cornerLumaMin, tapM.w);
        half allLumaMax = max(cornerLumaMax, tapM.w);

        // Low local contrast: leave the pixel alone.
        if((allLumaMax - allLumaMin) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, cornerLumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) {
            #if (FXAA_DISCARD == 1)
                FxaaDiscard;
            #else
                return tapM;
            #endif
        }
    #endif

    // Unit direction for the near taps.
    half2 stepNear = normalize(edgeDir.xyz).xz;
    half nearAbsMinTimesC = min(abs(stepNear.x), abs(stepNear.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);

    // Stretched direction for the far taps, clamped to +/-2 per axis.
    half2 stepFar = clamp(stepNear.xy / nearAbsMinTimesC, half(-2.0), half(2.0));

    // Pixel center narrowed to half precision, as in the packed original.
    half2 center = pos.xy;

    // Near pair: offsets use the 0.5/size scale (.zw).
    half2 uvNearNeg = center - stepNear * rcpFrameOpt.zw;
    half4 nearNeg = FxaaTexTop(tex, uvNearNeg);
    half2 uvNearPos = center + stepNear * rcpFrameOpt.zw;
    half4 nearPos = FxaaTexTop(tex, uvNearPos);
    half4 nearAvg = (nearNeg + nearPos) * 0.5;

    // Far pair: offsets use the 2.0/size scale (.xy).
    half2 uvFarNeg = center - stepFar * rcpFrameOpt.xy;
    half4 farNeg = FxaaTexTop(tex, uvFarNeg);
    half2 uvFarPos = center + stepFar * rcpFrameOpt.xy;
    half4 farPos = FxaaTexTop(tex, uvFarPos);
    half4 result = (farNeg + farPos) * 0.5;

    #if (FXAA_EARLY_EXIT == 0)
        // Without early exit the corner luma range is only needed here.
        half cornerLumaMin = min(min(tapNw.w, tapSw.w), min(tapNe.w, tapSe.w));
        half cornerLumaMax = max(max(tapNw.w, tapSw.w), max(tapNe.w, tapSe.w));
    #endif

    // Average of all four taps.
    result = (result + nearAvg) * 0.5;

    // Fall back to the near-pair average when the four-tap luma is outside
    // the corner luma range.
    bool belowRange = result.w < cornerLumaMin;
    bool aboveRange = result.w > cornerLumaMax;
    if(belowRange || aboveRange) {
        result = nearAvg;
    }

    return result;
}
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 QUALITY - PC

View File

@ -855,823 +855,6 @@ static const char* fxaa_fx =
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 CONSOLE - 360 PIXEL SHADER\n"
"\n"
"------------------------------------------------------------------------------\n"
"Might be some optimizations left here,\n"
"as of this latest change didn't have a PIX dump to verify if TEX bound.\n"
"============================================================================*/\n"
"#if (FXAA_360 == 1)\n"
"/*--------------------------------------------------------------------------*/\n"
"half4 FxaaPixelShader(\n"
" // {xy} = center of pixel\n"
" float2 pos,\n"
" // {xy__} = upper left of pixel\n"
" // {__zw} = lower right of pixel\n"
" float4 posPos,\n"
" // {rgb_} = color in linear or perceptual color space\n"
" // {___a} = alpha output is junk value\n"
" FxaaTex tex,\n"
" // This must be from a constant/uniform.\n"
" // {xy} = rcpFrame not used on PC version of FXAA Console\n"
" float2 rcpFrame,\n"
" // This must be from a constant/uniform.\n"
" // {x___} = 2.0/screenWidthInPixels\n"
" // {_y__} = 2.0/screenHeightInPixels\n"
" // {__z_} = 0.5/screenWidthInPixels\n"
" // {___w} = 0.5/screenHeightInPixels\n"
" float4 rcpFrameOpt\n"
") {\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 lumaNwNeSwSe;\n"
" lumaNwNeSwSe.x = FxaaTexTop(tex, posPos.xy).w;\n"
" lumaNwNeSwSe.y = FxaaTexTop(tex, posPos.zy).w;\n"
" lumaNwNeSwSe.z = FxaaTexTop(tex, posPos.xw).w;\n"
" lumaNwNeSwSe.w = FxaaTexTop(tex, posPos.zw).w;\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 rgbyM = FxaaTexTop(tex, pos.xy);\n"
"/*--------------------------------------------------------------------------*/\n"
" lumaNwNeSwSe.y += 1.0/384.0;\n"
"/*--------------------------------------------------------------------------*/\n"
" half2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
" half2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
"/*--------------------------------------------------------------------------*/\n"
" half lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);\n"
" half lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);\n"
"/*--------------------------------------------------------------------------*/\n"
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;\n"
"/*--------------------------------------------------------------------------*/\n"
" half2 dir;\n"
" dir.x = dot(lumaNwNeSwSe, float4(-1.0, -1.0, 1.0, 1.0));\n"
" dir.y = dot(lumaNwNeSwSe, float4( 1.0, -1.0, 1.0,-1.0));\n"
"/*--------------------------------------------------------------------------*/\n"
" half2 dir1;\n"
" dir1 = normalize(dir.xy);\n"
"/*--------------------------------------------------------------------------*/\n"
" half dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * FXAA_CONSOLE_EDGE_SHARPNESS;\n"
" half2 dir2;\n"
" dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0);\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * rcpFrameOpt.zw);\n"
" half4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * rcpFrameOpt.zw);\n"
" half4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * rcpFrameOpt.xy);\n"
" half4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * rcpFrameOpt.xy);\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 rgbyA = rgbyN1 * 0.5 + rgbyP1 * 0.5;\n"
" half4 rgbyB = rgbyN2 * 0.25 + rgbyP2 * 0.25 + rgbyA * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
" bool twoTap = (rgbyB.w < lumaMin) || (rgbyB.w > lumaMax);\n"
" if(twoTap) rgbyB.xyz = rgbyA.xyz;\n"
" return rgbyB; }\n"
"/*==========================================================================*/\n"
"#endif\n"
"\n"
"\n"
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE\n"
"\n"
"------------------------------------------------------------------------------\n"
"This prototype optimized version thanks to suggestions from Andy Luedke.\n"
"Should be fully tex bound in all cases.\n"
"As of the FXAA 3.10 release I have not tested this code,\n"
"but at least the missing \";\" was fixed.\n"
"If it does not work, please let me know so I can fix it.\n"
"------------------------------------------------------------------------------\n"
"Extra requirements,\n"
"(1.) Different inputs: no posPos.\n"
"(2.) Different inputs: alias three samplers with different exp bias settings!\n"
"(3.) New constants: setup fxaaConst as described below.\n"
"============================================================================*/\n"
"#if (FXAA_360_OPT == 1)\n"
"/*--------------------------------------------------------------------------*/\n"
"[reduceTempRegUsage(4)]\n"
"float4 FxaaPixelShader(\n"
" // {xy} = center of pixel\n"
" float2 pos,\n"
" // Three samplers,\n"
" // texExpBias0 = exponent bias 0\n"
" // texExpBiasNeg1 = exponent bias -1\n"
" // texExpBiasNeg2 = exponent bias -2\n"
" // {rgb_} = color in linear or perceptual color space\n"
" // {___a} = alpha output is junk value\n"
" uniform sampler2D texExpBias0,\n"
" uniform sampler2D texExpBiasNeg1,\n"
" uniform sampler2D texExpBiasNeg2,\n"
" // These must be in physical constant registers and NOT immedates\n"
" // Immedates will result in compiler un-optimizing\n"
" // width = screen width in pixels\n"
" // height = screen height in pixels\n"
" fxaaConstDir, // float4(1.0, -1.0, 0.25, -0.25);\n"
" fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height);\n"
" fxaaConstOuter // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height);\n"
") {\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 lumaNwNeSwSe;\n"
" asm { \n"
" tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false\n"
" tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false\n"
" tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false\n"
" tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false\n"
" };\n"
"/*--------------------------------------------------------------------------*/\n"
" lumaNwNeSwSe.y += 1.0/384.0;\n"
" float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
" float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
" float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);\n"
" float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0));\n"
" float4 lumaMinM = min(lumaMin, rgbyM.w);\n"
" float4 lumaMaxM = max(lumaMax, rgbyM.w);\n"
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;\n"
"/*--------------------------------------------------------------------------*/\n"
" float2 dir;\n"
" dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx);\n"
" dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy);\n"
" dir = normalize(dir);\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 dir1 = dir.xyxy * fxaaConstInner.xyzw;\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 dir2;\n"
" float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y));\n"
" dir2 = saturate(fxaaConstOuter.zzww * dir.xyxy / FXAA_CONSOLE_EDGE_SHARPNESS / dirAbsMinTimesC + 0.5);\n"
" dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw;\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0));\n"
" float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0));\n"
" float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0));\n"
" float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0));\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 rgbyA = rgbyN1 + rgbyP1;\n"
" half4 rgbyB = rgbyN2 + rgbyP2 * 0.5 + rgbyA;\n"
"/*--------------------------------------------------------------------------*/\n"
" float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB;\n"
" rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA;\n"
" return rgbyR; }\n"
"/*==========================================================================*/\n"
"#endif\n"
"\n"
"\n"
"\n"
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)\n"
"\n"
"==============================================================================\n"
"The code below does not exactly match the assembly.\n"
"I have a feeling that 12 cycles is possible, but was not able to get there.\n"
"Might have to increase register count to get full performance.\n"
"Note this shader does not use perspective interpolation.\n"
"\n"
"Use the following cgc options,\n"
"\n"
" --fenable-bx2 --fastmath --fastprecision --nofloatbindings\n"
"\n"
"------------------------------------------------------------------------------\n"
" NVSHADERPERF OUTPUT\n"
"------------------------------------------------------------------------------\n"
"For reference and to aid in debug, output of NVShaderPerf should match this,\n"
"\n"
"Shader to schedule:\n"
" 0: texpkb h0.w(TRUE), v5.zyxx, #0\n"
" 2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x\n"
" 4: texpkb h0.w(TRUE), v5.xwxx, #0\n"
" 6: addh h0.z(TRUE), -h2, h0.w\n"
" 7: texpkb h1.w(TRUE), v5, #0\n"
" 9: addh h0.x(TRUE), h0.z, -h1.w\n"
" 10: addh h3.w(TRUE), h0.z, h1\n"
" 11: texpkb h2.w(TRUE), v5.zwzz, #0\n"
" 13: addh h0.z(TRUE), h3.w, -h2.w\n"
" 14: addh h0.x(TRUE), h2.w, h0\n"
" 15: nrmh h1.xz(TRUE), h0_n\n"
" 16: minh_m8 h0.x(TRUE), |h1|, |h1.z|\n"
" 17: maxh h4.w(TRUE), h0, h1\n"
" 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n\n"
" 19: movr r1.zw(TRUE), v4.xxxy\n"
" 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww\n"
" 22: minh h5.w(TRUE), h0, h1\n"
" 23: texpkb h0(TRUE), r2.xzxx, #0\n"
" 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1\n"
" 27: maxh h4.x(TRUE), h2.z, h2.w\n"
" 28: texpkb h1(TRUE), r0.zwzz, #0\n"
" 30: addh_d2 h1(TRUE), h0, h1\n"
" 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
" 33: texpkb h0(TRUE), r0, #0\n"
" 35: minh h4.z(TRUE), h2, h2.w\n"
" 36: fenct TRUE\n"
" 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
" 39: texpkb h2(TRUE), r1, #0\n"
" 41: addh_d2 h0(TRUE), h0, h2\n"
" 42: maxh h2.w(TRUE), h4, h4.x\n"
" 43: minh h2.x(TRUE), h5.w, h4.z\n"
" 44: addh_d2 h0(TRUE), h0, h1\n"
" 45: slth h2.x(TRUE), h0.w, h2\n"
" 46: sgth h2.w(TRUE), h0, h2\n"
" 47: movh h0(TRUE), h0\n"
" 48: addx.c0 rc(TRUE), h2, h2.w\n"
" 49: movh h0(c0.NE.x), h1\n"
"\n"
"IPU0 ------ Simplified schedule: --------\n"
"Pass | Unit | uOp | PC: Op\n"
"-----+--------+------+-------------------------\n"
" 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
" | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
" | SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-;\n"
" | | |\n"
" 2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
" | TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
" | SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-;\n"
" | | |\n"
" 3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;\n"
" | TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;\n"
" | SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---;\n"
" | SCB1 | add | 10: ADDh h3.w, h0.---z, h1;\n"
" | | |\n"
" 4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
" | TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
" | SCB0 | add | 14: ADDh h0.x, h2.w---, h0;\n"
" | SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-;\n"
" | | |\n"
" 5 | SCT1 | mov | 15: NRMh h1.xz, h0;\n"
" | SRB | nrm | 15: NRMh h1.xz, h0;\n"
" | SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|;\n"
" | SCB1 | max | 17: MAXh h4.w, h0, h1;\n"
" | | |\n"
" 6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0;\n"
" | SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy;\n"
" | SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-;\n"
" | SCB1 | min | 22: MINh h5.w, h0, h1;\n"
" | | |\n"
" 7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;\n"
" | TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;\n"
" | SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---;\n"
" | SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1;\n"
" | | |\n"
" 8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;\n"
" | TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;\n"
" | SCB0/1 | add | 30: ADDh/2 h1, h0, h1;\n"
" | | |\n"
" 9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--;\n"
" | SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0;\n"
" | TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0;\n"
" | SCB1 | min | 35: MINh h4.z, h2, h2.--w-;\n"
" | | |\n"
" 10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--;\n"
" | SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0;\n"
" | TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0;\n"
" | SCB0/1 | add | 41: ADDh/2 h0, h0, h2;\n"
" | | |\n"
" 11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---;\n"
" | SCT1 | max | 42: MAXh h2.w, h4, h4.---x;\n"
" | SCB0/1 | add | 44: ADDh/2 h0, h0, h1;\n"
" | | |\n"
" 12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;\n"
" | SCT1 | set | 46: SGTh h2.w, h0, h2;\n"
" | SCB0/1 | mul | 47: MOVh h0, h0;\n"
" | | |\n"
" 13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;\n"
" | SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;\n"
" \n"
"Pass SCT TEX SCB\n"
" 1: 0\% 100\% 25\%\n"
" 2: 0\% 100\% 25\%\n"
" 3: 0\% 100\% 50\%\n"
" 4: 0\% 100\% 50\%\n"
" 5: 0\% 0\% 50\%\n"
" 6: 100\% 0\% 75\%\n"
" 7: 0\% 100\% 75\%\n"
" 8: 0\% 100\% 100\%\n"
" 9: 0\% 100\% 25\%\n"
" 10: 0\% 100\% 100\%\n"
" 11: 50\% 0\% 100\%\n"
" 12: 50\% 0\% 100\%\n"
" 13: 25\% 0\% 100\%\n"
"\n"
"MEAN: 17\% 61\% 67\%\n"
"\n"
"Pass SCT0 SCT1 TEX SCB0 SCB1\n"
" 1: 0\% 0\% 100\% 0\% 100\%\n"
" 2: 0\% 0\% 100\% 0\% 100\%\n"
" 3: 0\% 0\% 100\% 100\% 100\%\n"
" 4: 0\% 0\% 100\% 100\% 100\%\n"
" 5: 0\% 0\% 0\% 100\% 100\%\n"
" 6: 100\% 100\% 0\% 100\% 100\%\n"
" 7: 0\% 0\% 100\% 100\% 100\%\n"
" 8: 0\% 0\% 100\% 100\% 100\%\n"
" 9: 0\% 0\% 100\% 0\% 100\%\n"
" 10: 0\% 0\% 100\% 100\% 100\%\n"
" 11: 100\% 100\% 0\% 100\% 100\%\n"
" 12: 100\% 100\% 0\% 100\% 100\%\n"
" 13: 100\% 0\% 0\% 100\% 100\%\n"
"\n"
"MEAN: 30\% 23\% 61\% 76\% 100\%\n"
"Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5\n"
"Results 13 cycles, 3 r regs, 923,076,923 pixels/s\n"
"============================================================================*/\n"
"#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)\n"
"/*--------------------------------------------------------------------------*/\n"
"#pragma disablepc all\n"
"#pragma option O3\n"
"#pragma option OutColorPrec=fp16\n"
"#pragma texformat default RGBA8\n"
"/*==========================================================================*/\n"
"half4 FxaaPixelShader(\n"
" // {xy} = center of pixel\n"
" float2 pos,\n"
" // {xy__} = upper left of pixel\n"
" // {__zw} = lower right of pixel\n"
" float4 posPos,\n"
" // {rgb_} = color in linear or perceptual color space\n"
" // {___a} = luma in perceptual color space (not linear)\n"
" sampler2D tex,\n"
" // This must be from a constant/uniform.\n"
" // {xy} = rcpFrame not used on PS3\n"
" float2 rcpFrame,\n"
" // This must be from a constant/uniform.\n"
" // {x___} = 2.0/screenWidthInPixels\n"
" // {_y__} = 2.0/screenHeightInPixels\n"
" // {__z_} = 0.5/screenWidthInPixels\n"
" // {___w} = 0.5/screenHeightInPixels\n"
" float4 rcpFrameOpt\n"
") {\n"
"/*--------------------------------------------------------------------------*/\n"
"// (1)\n"
" half4 dir;\n"
" half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));\n"
" lumaNe.w += half(1.0/512.0);\n"
" dir.x = -lumaNe.w;\n"
" dir.z = -lumaNe.w;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (2)\n"
" half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));\n"
" dir.x += lumaSw.w;\n"
" dir.z += lumaSw.w;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (3)\n"
" half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));\n"
" dir.x -= lumaNw.w;\n"
" dir.z += lumaNw.w;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (4)\n"
" half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));\n"
" dir.x += lumaSe.w;\n"
" dir.z -= lumaSe.w;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (5)\n"
" half4 dir1_pos;\n"
" dir1_pos.xy = normalize(dir.xyz).xz;\n"
" half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (6)\n"
" half4 dir2_pos;\n"
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));\n"
" dir1_pos.zw = pos.xy;\n"
" dir2_pos.zw = pos.xy;\n"
" half4 temp1N;\n"
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (7)\n"
" temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));\n"
" half4 rgby1;\n"
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (8)\n"
" rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));\n"
" rgby1 = (temp1N + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (9)\n"
" half4 temp2N;\n"
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
" temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));\n"
"/*--------------------------------------------------------------------------*/\n"
"// (10)\n"
" half4 rgby2;\n"
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
" rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));\n"
" rgby2 = (temp2N + rgby2) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (11)\n"
" // compilier moves these scalar ops up to other cycles\n"
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (12)\n"
" bool twoTapLt = rgby2.w < lumaMin;\n"
" bool twoTapGt = rgby2.w > lumaMax;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (13)\n"
" if(twoTapLt || twoTapGt) rgby2 = rgby1;\n"
"/*--------------------------------------------------------------------------*/\n"
" return rgby2; }\n"
"/*==========================================================================*/\n"
"#endif\n"
"\n"
"\n"
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)\n"
"\n"
"==============================================================================\n"
"The code mostly matches the assembly.\n"
"I have a feeling that 14 cycles is possible, but was not able to get there.\n"
"Might have to increase register count to get full performance.\n"
"Note this shader does not use perspective interpolation.\n"
"\n"
"Use the following cgc options,\n"
"\n"
" --fenable-bx2 --fastmath --fastprecision --nofloatbindings\n"
"\n"
"------------------------------------------------------------------------------\n"
" NVSHADERPERF OUTPUT\n"
"------------------------------------------------------------------------------\n"
"For reference and to aid in debug, output of NVShaderPerf should match this,\n"
"\n"
"Shader to schedule:\n"
" 0: texpkb h0.w(TRUE), v5.zyxx, #0\n"
" 2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x\n"
" 4: texpkb h1.w(TRUE), v5.xwxx, #0\n"
" 6: addh h0.x(TRUE), h1.w, -h2.y\n"
" 7: texpkb h2.w(TRUE), v5.zwzz, #0\n"
" 9: minh h4.w(TRUE), h2.y, h2\n"
" 10: maxh h5.x(TRUE), h2.y, h2.w\n"
" 11: texpkb h0.w(TRUE), v5, #0\n"
" 13: addh h3.w(TRUE), -h0, h0.x\n"
" 14: addh h0.x(TRUE), h0.w, h0\n"
" 15: addh h0.z(TRUE), -h2.w, h0.x\n"
" 16: addh h0.x(TRUE), h2.w, h3.w\n"
" 17: minh h5.y(TRUE), h0.w, h1.w\n"
" 18: nrmh h2.xz(TRUE), h0_n\n"
" 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|\n"
" 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w\n"
" 21: movr r1.zw(TRUE), v4.xxxy\n"
" 22: maxh h2.w(TRUE), h0, h1\n"
" 23: fenct TRUE\n"
" 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz\n"
" 26: texpkb h0(TRUE), r0, #0\n"
" 28: maxh h5.x(TRUE), h2.w, h5\n"
" 29: minh h5.w(TRUE), h5.y, h4\n"
" 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz\n"
" 32: texpkb h2(TRUE), r1, #0\n"
" 34: addh_d2 h2(TRUE), h0, h2\n"
" 35: texpkb h1(TRUE), v4, #0\n"
" 37: maxh h5.y(TRUE), h5.x, h1.w\n"
" 38: minh h4.w(TRUE), h1, h5\n"
" 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
" 41: texpkb h0(TRUE), r0, #0\n"
" 43: addh_m8 h5.z(TRUE), h5.y, -h4.w\n"
" 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
" 46: texpkb h3(TRUE), r2, #0\n"
" 48: addh_d2 h0(TRUE), h0, h3\n"
" 49: addh_d2 h3(TRUE), h0, h2\n"
" 50: movh h0(TRUE), h3\n"
" 51: slth h3.x(TRUE), h3.w, h5.w\n"
" 52: sgth h3.w(TRUE), h3, h5.x\n"
" 53: addx.c0 rc(TRUE), h3.x, h3\n"
" 54: slth.c0 rc(TRUE), h5.z, h5\n"
" 55: movh h0(c0.NE.w), h2\n"
" 56: movh h0(c0.NE.x), h1\n"
"\n"
"IPU0 ------ Simplified schedule: --------\n"
"Pass | Unit | uOp | PC: Op\n"
"-----+--------+------+-------------------------\n"
" 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
" | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
" | SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--;\n"
" | | |\n"
" 2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
" | TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
" | SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---;\n"
" | | |\n"
" 3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
" | TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
" | SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---;\n"
" | SCB1 | min | 9: MINh h4.w, h2.---y, h2;\n"
" | | |\n"
" 4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;\n"
" | TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;\n"
" | SCB0 | add | 14: ADDh h0.x, h0.w---, h0;\n"
" | SCB1 | add | 13: ADDh h3.w,-h0, h0.---x;\n"
" | | |\n"
" 5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---;\n"
" | SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-;\n"
" | SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--;\n"
" | | |\n"
" 6 | SCT1 | mov | 18: NRMh h2.xz, h0;\n"
" | SRB | nrm | 18: NRMh h2.xz, h0;\n"
" | SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|;\n"
" | | |\n"
" 7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--;\n"
" | SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy;\n"
" | SCB1 | max | 22: MAXh h2.w, h0, h1;\n"
" | | |\n"
" 8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;\n"
" | SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0;\n"
" | TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0;\n"
" | SCB0 | max | 28: MAXh h5.x, h2.w---, h5;\n"
" | SCB1 | min | 29: MINh h5.w, h5.---y, h4;\n"
" | | |\n"
" 9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--;\n"
" | SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0;\n"
" | TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0;\n"
" | SCB0/1 | add | 34: ADDh/2 h2, h0, h2;\n"
" | | |\n"
" 10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;\n"
" | TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;\n"
" | SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--;\n"
" | SCB1 | min | 38: MINh h4.w, h1, h5;\n"
" | | |\n"
" 11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--;\n"
" | SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0;\n"
" | TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0;\n"
" | SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--;\n"
" | SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-;\n"
" | | |\n"
" 12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0;\n"
" | TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0;\n"
" | SCB0/1 | add | 48: ADDh/2 h0, h0, h3;\n"
" | | |\n"
" 13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2;\n"
" | SCB0/1 | mul | 50: MOVh h0, h3;\n"
" | | |\n"
" 14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---;\n"
" | SCT1 | set | 52: SGTh h3.w, h3, h5.---x;\n"
" | SCB0 | set | 54: SLThc0 rc, h5.z---, h5;\n"
" | SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3;\n"
" | | |\n"
" 15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2;\n"
" | SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1;\n"
" \n"
"Pass SCT TEX SCB\n"
" 1: 0\% 100\% 25\%\n"
" 2: 0\% 100\% 25\%\n"
" 3: 0\% 100\% 50\%\n"
" 4: 0\% 100\% 50\%\n"
" 5: 50\% 0\% 25\%\n"
" 6: 0\% 0\% 25\%\n"
" 7: 100\% 0\% 25\%\n"
" 8: 0\% 100\% 50\%\n"
" 9: 0\% 100\% 100\%\n"
" 10: 0\% 100\% 50\%\n"
" 11: 0\% 100\% 75\%\n"
" 12: 0\% 100\% 100\%\n"
" 13: 100\% 0\% 100\%\n"
" 14: 50\% 0\% 50\%\n"
" 15: 100\% 0\% 100\%\n"
"\n"
"MEAN: 26\% 60\% 56\%\n"
"\n"
"Pass SCT0 SCT1 TEX SCB0 SCB1\n"
" 1: 0\% 0\% 100\% 100\% 0\%\n"
" 2: 0\% 0\% 100\% 100\% 0\%\n"
" 3: 0\% 0\% 100\% 100\% 100\%\n"
" 4: 0\% 0\% 100\% 100\% 100\%\n"
" 5: 100\% 100\% 0\% 100\% 0\%\n"
" 6: 0\% 0\% 0\% 0\% 100\%\n"
" 7: 100\% 100\% 0\% 0\% 100\%\n"
" 8: 0\% 0\% 100\% 100\% 100\%\n"
" 9: 0\% 0\% 100\% 100\% 100\%\n"
" 10: 0\% 0\% 100\% 100\% 100\%\n"
" 11: 0\% 0\% 100\% 100\% 100\%\n"
" 12: 0\% 0\% 100\% 100\% 100\%\n"
" 13: 100\% 100\% 0\% 100\% 100\%\n"
" 14: 100\% 100\% 0\% 100\% 100\%\n"
" 15: 100\% 100\% 0\% 100\% 100\%\n"
"\n"
"MEAN: 33\% 33\% 60\% 86\% 80\%\n"
"Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5\n"
"Results 15 cycles, 3 r regs, 800,000,000 pixels/s\n"
"============================================================================*/\n"
"#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)\n"
"/*--------------------------------------------------------------------------*/\n"
"#pragma disablepc all\n"
"#pragma option O2\n"
"#pragma option OutColorPrec=fp16\n"
"#pragma texformat default RGBA8\n"
"/*==========================================================================*/\n"
"half4 FxaaPixelShader(\n"
" // {xy} = center of pixel\n"
" float2 pos,\n"
" // {xy__} = upper left of pixel\n"
" // {__zw} = lower right of pixel\n"
" float4 posPos,\n"
" // {rgb_} = color in linear or perceptual color space\n"
" // {___a} = luma in perceptual color space (not linear)\n"
" sampler2D tex,\n"
" // This must be from a constant/uniform.\n"
" // {xy} = rcpFrame not used on PS3\n"
" float2 rcpFrame,\n"
" // This must be from a constant/uniform.\n"
" // {x___} = 2.0/screenWidthInPixels\n"
" // {_y__} = 2.0/screenHeightInPixels\n"
" // {__z_} = 0.5/screenWidthInPixels\n"
" // {___w} = 0.5/screenHeightInPixels\n"
" float4 rcpFrameOpt\n"
") {\n"
"/*--------------------------------------------------------------------------*/\n"
"// (1)\n"
" half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));\n"
" half lumaNe = rgbyNe.w + half(1.0/512.0);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (2)\n"
" half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));\n"
" half lumaSwNegNe = lumaSw.w - lumaNe;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (3)\n"
" half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));\n"
" half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);\n"
" half lumaMinNwSw = min(lumaNw.w, lumaSw.w);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (4)\n"
" half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));\n"
" half dirZ = lumaNw.w + lumaSwNegNe;\n"
" half dirX = -lumaNw.w + lumaSwNegNe;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (5)\n"
" half3 dir;\n"
" dir.y = 0.0;\n"
" dir.x = lumaSe.w + dirX;\n"
" dir.z = -lumaSe.w + dirZ;\n"
" half lumaMinNeSe = min(lumaNe, lumaSe.w);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (6)\n"
" half4 dir1_pos;\n"
" dir1_pos.xy = normalize(dir).xz;\n"
" half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (7)\n"
" half4 dir2_pos;\n"
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));\n"
" dir1_pos.zw = pos.xy;\n"
" dir2_pos.zw = pos.xy;\n"
" half lumaMaxNeSe = max(lumaNe, lumaSe.w);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (8)\n"
" half4 temp1N;\n"
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
" temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));\n"
" half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);\n"
" half lumaMin = min(lumaMinNwSw, lumaMinNeSe);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (9)\n"
" half4 rgby1;\n"
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
" rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));\n"
" rgby1 = (temp1N + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (10)\n"
" half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));\n"
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
"/*--------------------------------------------------------------------------*/\n"
"// (11)\n"
" half4 temp2N;\n"
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
" temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));\n"
" half4 rgby2;\n"
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
" half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE_EDGE_THRESHOLD;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (12)\n"
" rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));\n"
" rgby2 = (temp2N + rgby2) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (13)\n"
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (14)\n"
" bool twoTapLt = rgby2.w < lumaMin;\n"
" bool twoTapGt = rgby2.w > lumaMax;\n"
" bool earlyExit = lumaRangeM < lumaMax;\n"
" bool twoTap = twoTapLt || twoTapGt;\n"
"/*--------------------------------------------------------------------------*/\n"
"// (15)\n"
" if(twoTap) rgby2 = rgby1;\n"
" if(earlyExit) rgby2 = rgbyM;\n"
"/*--------------------------------------------------------------------------*/\n"
" return rgby2; }\n"
"/*==========================================================================*/\n"
"#endif\n"
"\n"
"\n"
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 CONSOLE - PC PIXEL SHADER\n"
"\n"
"------------------------------------------------------------------------------\n"
"Using a modified version of the PS3 version here to best target old hardware.\n"
"============================================================================*/\n"
// ---------------------------------------------------------------------------
// Embedded shader text: FxaaPixelShader, "FXAA3 CONSOLE - PC" variant.
//
// The literals below are concatenated into the shader source string. The
// notes are written as host-side comments placed *between* the literals so
// the shader text itself (and its size) is left exactly as it was.
//
// Shader-side switches visible in this block:
//   FXAA_PC_CONSOLE == 1  : the whole function is compiled only in this case.
//   FXAA_EARLY_EXIT       : 1 = test local contrast first and bail out early;
//                           0 = always run the full filter.
//   FXAA_DISCARD          : with early exit, 1 = FxaaDiscard, otherwise the
//                           centre texel is returned unchanged.
//
// Parameters as used by the body: pos (pixel centre), posPos (xy/zw corner
// coordinates), tex (sampled through FxaaTexTop), rcpFrameOpt (xy and zw are
// used as two different tap-offset scales). rcpFrame is never referenced in
// the body. The .w channel of every fetched texel is what the code treats as
// luma.
// ---------------------------------------------------------------------------
"#if (FXAA_PC_CONSOLE == 1)\n"
"/*--------------------------------------------------------------------------*/\n"
"half4 FxaaPixelShader(\n"
" // {xy} = center of pixel\n"
" float2 pos,\n"
" // {xy__} = upper left of pixel\n"
" // {__zw} = lower right of pixel\n"
" float4 posPos,\n"
" // {rgb_} = color in linear or perceptual color space\n"
" // {___a} = alpha output is junk value\n"
" FxaaTex tex,\n"
" // This must be from a constant/uniform.\n"
" // {xy} = rcpFrame not used on PC version of FXAA Console\n"
" float2 rcpFrame,\n"
" // This must be from a constant/uniform.\n"
" // {x___} = 2.0/screenWidthInPixels\n"
" // {_y__} = 2.0/screenHeightInPixels\n"
" // {__z_} = 0.5/screenWidthInPixels\n"
" // {___w} = 0.5/screenHeightInPixels\n"
" float4 rcpFrameOpt\n"
") {\n"
"/*--------------------------------------------------------------------------*/\n"
// Corner taps. dir.x and dir.z are built up incrementally as each corner is
// fetched; once all four taps are in:
//   dir.x = (lumaSw + lumaSe) - (lumaNe + lumaNw)
//   dir.z = (lumaSw + lumaNw) - (lumaNe + lumaSe)
// dir.y stays 0 so that normalize(dir.xyz) further down acts on (x, z) only.
// The 1.0/384.0 bias is added to the NE luma before it is used anywhere, so
// it also feeds lumaMin/lumaMax below.
// NOTE(review): presumably the bias keeps dir from being exactly zero on flat
// areas before the normalize - confirm against the FXAA reference notes.
" half4 dir;\n"
" dir.y = 0.0;\n"
" half4 lumaNe = FxaaTexTop(tex, posPos.zy);\n"
" lumaNe.w += half(1.0/384.0);\n"
" dir.x = -lumaNe.w;\n"
" dir.z = -lumaNe.w;\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 lumaSw = FxaaTexTop(tex, posPos.xw);\n"
" dir.x += lumaSw.w;\n"
" dir.z += lumaSw.w;\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 lumaNw = FxaaTexTop(tex, posPos.xy);\n"
" dir.x -= lumaNw.w;\n"
" dir.z += lumaNw.w;\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 lumaSe = FxaaTexTop(tex, posPos.zw);\n"
" dir.x += lumaSe.w;\n"
" dir.z -= lumaSe.w;\n"
"/*==========================================================================*/\n"
// Optional early exit. lumaMin/lumaMax cover the four corners only;
// lumaMinM/lumaMaxM additionally include the centre texel. The pixel is left
// alone (discarded, or returned as the unfiltered centre texel) when the
// five-tap luma range is below
//   max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)
// Note the relative threshold scales the corner-only lumaMax, not lumaMaxM.
" #if (FXAA_EARLY_EXIT == 1)\n"
" half4 rgbyM = FxaaTexTop(tex, pos.xy);\n"
"/*--------------------------------------------------------------------------*/\n"
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
"/*--------------------------------------------------------------------------*/\n"
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
"/*--------------------------------------------------------------------------*/\n"
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD))\n"
" #if (FXAA_DISCARD == 1)\n"
" FxaaDiscard;\n"
" #else\n"
" return rgbyM;\n"
" #endif\n"
" #endif\n"
"/*==========================================================================*/\n"
// Direction setup.
//   dir1_pos.xy : unit-length direction taken from the (x, z) parts of dir.
//   dir2_pos.xy : dir1 divided by (smaller |component| * FXAA_CONSOLE_EDGE_SHARPNESS),
//                 then clamped to [-2, 2] per component.
//   *.zw        : both vectors carry the pixel centre (pos.xy) in zw.
// NOTE(review): if either component of dir1 is exactly 0 the divisor is 0;
// the outcome then depends on the target's divide/clamp semantics - verify.
" half4 dir1_pos;\n"
" dir1_pos.xy = normalize(dir.xyz).xz;\n"
" half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 dir2_pos;\n"
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));\n"
" dir1_pos.zw = pos.xy;\n"
" dir2_pos.zw = pos.xy;\n"
// First tap pair: centre -/+ dir1 * rcpFrameOpt.zw. Each temporary first
// holds the sample coordinate in .xy and is then overwritten by the fetched
// texel. rgby1 ends up as the average of the two taps.
" half4 temp1N;\n"
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
"/*--------------------------------------------------------------------------*/\n"
" temp1N = FxaaTexTop(tex, temp1N.xy);\n"
" half4 rgby1;\n"
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
"/*--------------------------------------------------------------------------*/\n"
" rgby1 = FxaaTexTop(tex, rgby1.xy);\n"
" rgby1 = (temp1N + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
// Second tap pair: centre -/+ dir2 * rcpFrameOpt.xy (the larger offset scale
// per the parameter notes above), averaged into rgby2.
" half4 temp2N;\n"
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
" temp2N = FxaaTexTop(tex, temp2N.xy);\n"
"/*--------------------------------------------------------------------------*/\n"
" half4 rgby2;\n"
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
" rgby2 = FxaaTexTop(tex, rgby2.xy);\n"
" rgby2 = (temp2N + rgby2) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
// Without the early-exit path lumaMin/lumaMax have not been declared yet, so
// they are computed here (corner taps only, same formula as above).
" #if (FXAA_EARLY_EXIT == 0)\n"
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
" #endif\n"
// rgby2 becomes the mean of both pair averages, i.e. an equal-weight
// four-tap result.
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
"/*--------------------------------------------------------------------------*/\n"
// Range check: if the four-tap luma falls outside the corner luma range
// [lumaMin, lumaMax], fall back to the two-tap result rgby1.
" bool twoTapLt = rgby2.w < lumaMin;\n"
" bool twoTapGt = rgby2.w > lumaMax;\n"
"/*--------------------------------------------------------------------------*/\n"
" if(twoTapLt || twoTapGt) rgby2 = rgby1;\n"
"/*--------------------------------------------------------------------------*/\n"
" return rgby2; }\n"
"/*==========================================================================*/\n"
"#endif\n"
"\n"
"\n"
"\n"
"/*============================================================================\n"
"\n"
" FXAA3 QUALITY - PC\n"
"\n"
"============================================================================*/\n"