mirror of https://github.com/PCSX2/pcsx2.git
gsdx-ogl-wnd: VS2010 doesn't support string bigger than 64k bytes. So remove PS3&360 shader from fxaa
git-svn-id: http://pcsx2.googlecode.com/svn/branches/gsdx-ogl-wnd@5664 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
19961175c9
commit
adc232cb95
|
@ -825,823 +825,6 @@ A. In the last opaque pass prior to FXAA,
|
|||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 CONSOLE - 360 PIXEL SHADER
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
Might be some optimizations left here,
|
||||
as of this latest change I didn't have a PIX dump to verify whether it is TEX bound.
|
||||
============================================================================*/
|
||||
#if (FXAA_360 == 1)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*
 * FXAA3 Console pixel shader, Xbox 360 variant.
 *
 * Reads the .w channel of the four corner samples and of the center sample
 * as luma. When the local luma range is below the edge threshold the center
 * sample is returned unchanged; otherwise two pairs of taps are taken along
 * the estimated edge direction and blended.
 */
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = alpha output is junk value
    FxaaTex tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PC version of FXAA Console
    // (not referenced by this function either)
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
    // Corner luma packed as {x = NW, y = NE, z = SW, w = SE}.
    half4 cornerLuma;
    cornerLuma.x = FxaaTexTop(tex, posPos.xy).w;
    cornerLuma.y = FxaaTexTop(tex, posPos.zy).w;
    cornerLuma.z = FxaaTexTop(tex, posPos.xw).w;
    cornerLuma.w = FxaaTexTop(tex, posPos.zw).w;

    // Center sample: color in xyz, luma in w.
    half4 centerRgby = FxaaTexTop(tex, pos.xy);

    // Small constant offset on the NE luma.
    // NOTE(review): presumably keeps the direction vector from being exactly
    // zero on flat areas -- confirm against the FXAA documentation.
    cornerLuma.y += 1.0/384.0;

    // Min / max over the four corners.
    half2 pairMin = min(cornerLuma.xy, cornerLuma.zw);
    half2 pairMax = max(cornerLuma.xy, cornerLuma.zw);
    half lumaMin = min(pairMin.x, pairMin.y);
    half lumaMax = max(pairMax.x, pairMax.y);

    // Min / max including the center; used only for the early exit.
    half lumaMinM = min(lumaMin, centerRgby.w);
    half lumaMaxM = max(lumaMax, centerRgby.w);
    if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) {
        return centerRgby;
    }

    // Edge direction from the signed sums of the corner luma.
    half2 edgeDir;
    edgeDir.x = dot(cornerLuma, float4(-1.0, -1.0, 1.0, 1.0));
    edgeDir.y = dot(cornerLuma, float4( 1.0, -1.0, 1.0,-1.0));
    half2 unitDir = normalize(edgeDir.xy);

    // Second, longer direction: unit direction divided by its smaller
    // component (scaled by the sharpness setting), clamped to [-2, 2].
    half dirAbsMinTimesC = min(abs(unitDir.x), abs(unitDir.y)) * FXAA_CONSOLE_EDGE_SHARPNESS;
    half2 wideDir = clamp(unitDir.xy / dirAbsMinTimesC, -2.0, 2.0);

    // Narrow taps use the 0.5/size scale, wide taps the 2.0/size scale.
    half4 narrowNeg = FxaaTexTop(tex, pos.xy - unitDir * rcpFrameOpt.zw);
    half4 narrowPos = FxaaTexTop(tex, pos.xy + unitDir * rcpFrameOpt.zw);
    half4 wideNeg = FxaaTexTop(tex, pos.xy - wideDir * rcpFrameOpt.xy);
    half4 widePos = FxaaTexTop(tex, pos.xy + wideDir * rcpFrameOpt.xy);

    // Two-tap average and four-tap average.
    half4 blend2 = narrowNeg * 0.5 + narrowPos * 0.5;
    half4 blend4 = wideNeg * 0.25 + widePos * 0.25 + blend2 * 0.5;

    // If the four-tap luma leaves the corner luma range, fall back to the
    // two-tap color (only xyz is replaced; w keeps the four-tap value).
    bool useTwoTap = (blend4.w < lumaMin) || (blend4.w > lumaMax);
    if(useTwoTap) {
        blend4.xyz = blend2.xyz;
    }
    return blend4;
}
|
||||
/*==========================================================================*/
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
This prototype optimized version thanks to suggestions from Andy Luedke.
|
||||
Should be fully tex bound in all cases.
|
||||
As of the FXAA 3.10 release I have not tested this code,
|
||||
but at least the missing ";" was fixed.
|
||||
If it does not work, please let me know so I can fix it.
|
||||
------------------------------------------------------------------------------
|
||||
Extra requirements,
|
||||
(1.) Different inputs: no posPos.
|
||||
(2.) Different inputs: alias three samplers with different exp bias settings!
|
||||
(3.) New constants: setup fxaaConst as described below.
|
||||
============================================================================*/
|
||||
#if (FXAA_360_OPT == 1)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*
 * FXAA3 Console pixel shader, Xbox 360 optimized prototype.
 *
 * Same algorithm as the plain 360 console shader, but the corner luma is
 * fetched with tfetch2D offsets instead of a posPos input, and the blend
 * taps are read through samplers that alias the same texture with
 * different exponent bias settings.
 *
 * Fixes relative to the previous revision of this block:
 *  - the three fxaaConst* parameters were declared without a type;
 *  - lumaMinM / lumaMaxM were float4 although they hold scalars and feed
 *    a scalar `if` condition;
 *  - the final blend applied the 0.5 weight to rgbyP2 instead of rgbyA.
 */
[reduceTempRegUsage(4)]
float4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // Three samplers,
    //   texExpBias0    = exponent bias 0
    //   texExpBiasNeg1 = exponent bias -1
    //   texExpBiasNeg2 = exponent bias -2
    // {rgb_} = color in linear or perceptual color space
    // {___a} = alpha output is junk value
    uniform sampler2D texExpBias0,
    uniform sampler2D texExpBiasNeg1,
    uniform sampler2D texExpBiasNeg2,
    // These must be in physical constant registers and NOT immediates
    // Immediates will result in compiler un-optimizing
    //   width  = screen width in pixels
    //   height = screen height in pixels
    float4 fxaaConstDir,   // float4(1.0, -1.0, 0.25, -0.25);
    float4 fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height);
    float4 fxaaConstOuter  // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height);
) {
/*--------------------------------------------------------------------------*/
    // Corner luma packed as {x = NW, y = NE, z = SW, w = SE}; each fetch
    // writes the sample's w channel into one component.
    float4 lumaNwNeSwSe;
    asm {
        tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false
    };
/*--------------------------------------------------------------------------*/
    // Small constant offset on the NE luma, then min / max over the corners.
    lumaNwNeSwSe.y += 1.0/384.0;
    float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);
    float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);
/*--------------------------------------------------------------------------*/
    // Early exit when the luma range (corners + center) is below threshold.
    // lumaMinM / lumaMaxM are scalars so the `if` condition is scalar too.
    float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0));
    float lumaMinM = min(lumaMin, rgbyM.w);
    float lumaMaxM = max(lumaMax, rgbyM.w);
    if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;
/*--------------------------------------------------------------------------*/
    // Edge direction; with fxaaConstDir = (1, -1, ...) the swizzles give
    // (-1,-1,1,1) and (1,-1,1,-1), matching the plain 360 shader.
    float2 dir;
    dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx);
    dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy);
    dir = normalize(dir);
/*--------------------------------------------------------------------------*/
    // Narrow tap offsets: xy = +dir * 0.5/size, zw = -dir * 0.5/size.
    float4 dir1 = dir.xyxy * fxaaConstInner.xyzw;
/*--------------------------------------------------------------------------*/
    // Wide tap offsets, expressed through saturate() plus a rescale.
    // NOTE(review): kept verbatim from the previous revision; the algebra
    // has not been re-derived here.
    float4 dir2;
    float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y));
    dir2 = saturate(fxaaConstOuter.zzww * dir.xyxy / FXAA_CONSOLE_EDGE_SHARPNESS / dirAbsMinTimesC + 0.5);
    dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw;
/*--------------------------------------------------------------------------*/
    float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0));
    float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0));
    float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0));
    float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
    // Assumes exponent bias -1 / -2 delivers samples pre-scaled by 0.5 / 0.25
    // (which the un-weighted sum for rgbyA already relies on). The plain 360
    // shader blends 0.25*N2 + 0.25*P2 + 0.5*A, so with pre-scaled samples the
    // only explicit weight left is the 0.5 on rgbyA.
    half4 rgbyA = rgbyN1 + rgbyP1;
    half4 rgbyB = rgbyN2 + rgbyP2 + rgbyA * 0.5;
/*--------------------------------------------------------------------------*/
    // Use the two-tap result when the four-tap luma leaves [lumaMin, lumaMax].
    float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB;
    rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA;
    return rgbyR;
}
|
||||
/*==========================================================================*/
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)
|
||||
|
||||
==============================================================================
|
||||
The code below does not exactly match the assembly.
|
||||
I have a feeling that 12 cycles is possible, but was not able to get there.
|
||||
Might have to increase register count to get full performance.
|
||||
Note this shader does not use perspective interpolation.
|
||||
|
||||
Use the following cgc options,
|
||||
|
||||
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
NVSHADERPERF OUTPUT
|
||||
------------------------------------------------------------------------------
|
||||
For reference and to aid in debug, output of NVShaderPerf should match this,
|
||||
|
||||
Shader to schedule:
|
||||
0: texpkb h0.w(TRUE), v5.zyxx, #0
|
||||
2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
|
||||
4: texpkb h0.w(TRUE), v5.xwxx, #0
|
||||
6: addh h0.z(TRUE), -h2, h0.w
|
||||
7: texpkb h1.w(TRUE), v5, #0
|
||||
9: addh h0.x(TRUE), h0.z, -h1.w
|
||||
10: addh h3.w(TRUE), h0.z, h1
|
||||
11: texpkb h2.w(TRUE), v5.zwzz, #0
|
||||
13: addh h0.z(TRUE), h3.w, -h2.w
|
||||
14: addh h0.x(TRUE), h2.w, h0
|
||||
15: nrmh h1.xz(TRUE), h0_n
|
||||
16: minh_m8 h0.x(TRUE), |h1|, |h1.z|
|
||||
17: maxh h4.w(TRUE), h0, h1
|
||||
18: divx h2.xy(TRUE), h1_n.xzzw, h0_n
|
||||
19: movr r1.zw(TRUE), v4.xxxy
|
||||
20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww
|
||||
22: minh h5.w(TRUE), h0, h1
|
||||
23: texpkb h0(TRUE), r2.xzxx, #0
|
||||
25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1
|
||||
27: maxh h4.x(TRUE), h2.z, h2.w
|
||||
28: texpkb h1(TRUE), r0.zwzz, #0
|
||||
30: addh_d2 h1(TRUE), h0, h1
|
||||
31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
|
||||
33: texpkb h0(TRUE), r0, #0
|
||||
35: minh h4.z(TRUE), h2, h2.w
|
||||
36: fenct TRUE
|
||||
37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
|
||||
39: texpkb h2(TRUE), r1, #0
|
||||
41: addh_d2 h0(TRUE), h0, h2
|
||||
42: maxh h2.w(TRUE), h4, h4.x
|
||||
43: minh h2.x(TRUE), h5.w, h4.z
|
||||
44: addh_d2 h0(TRUE), h0, h1
|
||||
45: slth h2.x(TRUE), h0.w, h2
|
||||
46: sgth h2.w(TRUE), h0, h2
|
||||
47: movh h0(TRUE), h0
|
||||
48: addx.c0 rc(TRUE), h2, h2.w
|
||||
49: movh h0(c0.NE.x), h1
|
||||
|
||||
IPU0 ------ Simplified schedule: --------
|
||||
Pass | Unit | uOp | PC: Op
|
||||
-----+--------+------+-------------------------
|
||||
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
|
||||
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
|
||||
| SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-;
|
||||
| | |
|
||||
2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
|
||||
| TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
|
||||
| SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-;
|
||||
| | |
|
||||
3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
|
||||
| TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
|
||||
| SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---;
|
||||
| SCB1 | add | 10: ADDh h3.w, h0.---z, h1;
|
||||
| | |
|
||||
4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
|
||||
| TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
|
||||
| SCB0 | add | 14: ADDh h0.x, h2.w---, h0;
|
||||
| SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-;
|
||||
| | |
|
||||
5 | SCT1 | mov | 15: NRMh h1.xz, h0;
|
||||
| SRB | nrm | 15: NRMh h1.xz, h0;
|
||||
| SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|;
|
||||
| SCB1 | max | 17: MAXh h4.w, h0, h1;
|
||||
| | |
|
||||
6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0;
|
||||
| SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy;
|
||||
| SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-;
|
||||
| SCB1 | min | 22: MINh h5.w, h0, h1;
|
||||
| | |
|
||||
7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
|
||||
| TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
|
||||
| SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---;
|
||||
| SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1;
|
||||
| | |
|
||||
8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
|
||||
| TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
|
||||
| SCB0/1 | add | 30: ADDh/2 h1, h0, h1;
|
||||
| | |
|
||||
9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--;
|
||||
| SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0;
|
||||
| TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0;
|
||||
| SCB1 | min | 35: MINh h4.z, h2, h2.--w-;
|
||||
| | |
|
||||
10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--;
|
||||
| SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0;
|
||||
| TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0;
|
||||
| SCB0/1 | add | 41: ADDh/2 h0, h0, h2;
|
||||
| | |
|
||||
11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---;
|
||||
| SCT1 | max | 42: MAXh h2.w, h4, h4.---x;
|
||||
| SCB0/1 | add | 44: ADDh/2 h0, h0, h1;
|
||||
| | |
|
||||
12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;
|
||||
| SCT1 | set | 46: SGTh h2.w, h0, h2;
|
||||
| SCB0/1 | mul | 47: MOVh h0, h0;
|
||||
| | |
|
||||
13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;
|
||||
| SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;
|
||||
|
||||
Pass SCT TEX SCB
|
||||
1: 0% 100% 25%
|
||||
2: 0% 100% 25%
|
||||
3: 0% 100% 50%
|
||||
4: 0% 100% 50%
|
||||
5: 0% 0% 50%
|
||||
6: 100% 0% 75%
|
||||
7: 0% 100% 75%
|
||||
8: 0% 100% 100%
|
||||
9: 0% 100% 25%
|
||||
10: 0% 100% 100%
|
||||
11: 50% 0% 100%
|
||||
12: 50% 0% 100%
|
||||
13: 25% 0% 100%
|
||||
|
||||
MEAN: 17% 61% 67%
|
||||
|
||||
Pass SCT0 SCT1 TEX SCB0 SCB1
|
||||
1: 0% 0% 100% 0% 100%
|
||||
2: 0% 0% 100% 0% 100%
|
||||
3: 0% 0% 100% 100% 100%
|
||||
4: 0% 0% 100% 100% 100%
|
||||
5: 0% 0% 0% 100% 100%
|
||||
6: 100% 100% 0% 100% 100%
|
||||
7: 0% 0% 100% 100% 100%
|
||||
8: 0% 0% 100% 100% 100%
|
||||
9: 0% 0% 100% 0% 100%
|
||||
10: 0% 0% 100% 100% 100%
|
||||
11: 100% 100% 0% 100% 100%
|
||||
12: 100% 100% 0% 100% 100%
|
||||
13: 100% 0% 0% 100% 100%
|
||||
|
||||
MEAN: 30% 23% 61% 76% 100%
|
||||
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
|
||||
Results 13 cycles, 3 r regs, 923,076,923 pixels/s
|
||||
============================================================================*/
|
||||
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
#pragma disablepc all
|
||||
#pragma option O3
|
||||
#pragma option OutColorPrec=fp16
|
||||
#pragma texformat default RGBA8
|
||||
/*==========================================================================*/
|
||||
/*
 * FXAA3 Console pixel shader for PS3, variant without early exit.
 *
 * Builds an edge direction from the luma (w channel) of the four corner
 * samples, takes two pairs of taps along that direction and returns the
 * four-tap average, or the two-tap average when the four-tap luma falls
 * outside the corner luma range.
 *
 * The numbered steps (1)..(13) are kept in their original order.
 * NOTE(review): they appear to track the 13 passes of the NVShaderPerf
 * schedule in the comment preceding this function -- confirm before
 * reordering anything.
 *
 * Fix: dir.y is now zeroed before normalize(dir.xyz) reads it; it used to be
 * left uninitialized (the early-exit variant of this shader sets it to 0.0).
 */
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = luma in perceptual color space (not linear)
    sampler2D tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PS3
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
// (1)
    half4 dir;
    // Only x and z carry the gradient; y must be a defined zero so that the
    // 3-component normalize below does not read garbage.
    dir.y = 0.0;
    half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
    lumaNe.w += half(1.0/512.0);
    dir.x = -lumaNe.w;
    dir.z = -lumaNe.w;
/*--------------------------------------------------------------------------*/
// (2)
    half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
    dir.x += lumaSw.w;
    dir.z += lumaSw.w;
/*--------------------------------------------------------------------------*/
// (3)
    half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
    dir.x -= lumaNw.w;
    dir.z += lumaNw.w;
/*--------------------------------------------------------------------------*/
// (4)
    half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
    dir.x += lumaSe.w;
    dir.z -= lumaSe.w;
/*--------------------------------------------------------------------------*/
// (5)
    // dir1_pos packs the unit direction in xy and (from step 6) pos in zw.
    half4 dir1_pos;
    dir1_pos.xy = normalize(dir.xyz).xz;
    half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);
/*--------------------------------------------------------------------------*/
// (6)
    // dir2_pos packs the clamped wide direction in xy and pos in zw.
    half4 dir2_pos;
    dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
    dir1_pos.zw = pos.xy;
    dir2_pos.zw = pos.xy;
    half4 temp1N;
    temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (7)
    // temp1N / rgby1 first hold the tap coordinates, then the fetched sample.
    temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
    half4 rgby1;
    rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (8)
    rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
    rgby1 = (temp1N + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (9)
    half4 temp2N;
    temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;
    temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
// (10)
    half4 rgby2;
    rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;
    rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
    rgby2 = (temp2N + rgby2) * 0.5;
/*--------------------------------------------------------------------------*/
// (11)
    // compiler moves these scalar ops up to other cycles
    half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));
    half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));
    rgby2 = (rgby2 + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (12)
    bool twoTapLt = rgby2.w < lumaMin;
    bool twoTapGt = rgby2.w > lumaMax;
/*--------------------------------------------------------------------------*/
// (13)
    // Four-tap luma outside the corner range: fall back to the two-tap result.
    if(twoTapLt || twoTapGt) rgby2 = rgby1;
/*--------------------------------------------------------------------------*/
    return rgby2;
}
|
||||
/*==========================================================================*/
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)
|
||||
|
||||
==============================================================================
|
||||
The code mostly matches the assembly.
|
||||
I have a feeling that 14 cycles is possible, but was not able to get there.
|
||||
Might have to increase register count to get full performance.
|
||||
Note this shader does not use perspective interpolation.
|
||||
|
||||
Use the following cgc options,
|
||||
|
||||
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
NVSHADERPERF OUTPUT
|
||||
------------------------------------------------------------------------------
|
||||
For reference and to aid in debug, output of NVShaderPerf should match this,
|
||||
|
||||
Shader to schedule:
|
||||
0: texpkb h0.w(TRUE), v5.zyxx, #0
|
||||
2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
|
||||
4: texpkb h1.w(TRUE), v5.xwxx, #0
|
||||
6: addh h0.x(TRUE), h1.w, -h2.y
|
||||
7: texpkb h2.w(TRUE), v5.zwzz, #0
|
||||
9: minh h4.w(TRUE), h2.y, h2
|
||||
10: maxh h5.x(TRUE), h2.y, h2.w
|
||||
11: texpkb h0.w(TRUE), v5, #0
|
||||
13: addh h3.w(TRUE), -h0, h0.x
|
||||
14: addh h0.x(TRUE), h0.w, h0
|
||||
15: addh h0.z(TRUE), -h2.w, h0.x
|
||||
16: addh h0.x(TRUE), h2.w, h3.w
|
||||
17: minh h5.y(TRUE), h0.w, h1.w
|
||||
18: nrmh h2.xz(TRUE), h0_n
|
||||
19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|
|
||||
20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w
|
||||
21: movr r1.zw(TRUE), v4.xxxy
|
||||
22: maxh h2.w(TRUE), h0, h1
|
||||
23: fenct TRUE
|
||||
24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
|
||||
26: texpkb h0(TRUE), r0, #0
|
||||
28: maxh h5.x(TRUE), h2.w, h5
|
||||
29: minh h5.w(TRUE), h5.y, h4
|
||||
30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
|
||||
32: texpkb h2(TRUE), r1, #0
|
||||
34: addh_d2 h2(TRUE), h0, h2
|
||||
35: texpkb h1(TRUE), v4, #0
|
||||
37: maxh h5.y(TRUE), h5.x, h1.w
|
||||
38: minh h4.w(TRUE), h1, h5
|
||||
39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
|
||||
41: texpkb h0(TRUE), r0, #0
|
||||
43: addh_m8 h5.z(TRUE), h5.y, -h4.w
|
||||
44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
|
||||
46: texpkb h3(TRUE), r2, #0
|
||||
48: addh_d2 h0(TRUE), h0, h3
|
||||
49: addh_d2 h3(TRUE), h0, h2
|
||||
50: movh h0(TRUE), h3
|
||||
51: slth h3.x(TRUE), h3.w, h5.w
|
||||
52: sgth h3.w(TRUE), h3, h5.x
|
||||
53: addx.c0 rc(TRUE), h3.x, h3
|
||||
54: slth.c0 rc(TRUE), h5.z, h5
|
||||
55: movh h0(c0.NE.w), h2
|
||||
56: movh h0(c0.NE.x), h1
|
||||
|
||||
IPU0 ------ Simplified schedule: --------
|
||||
Pass | Unit | uOp | PC: Op
|
||||
-----+--------+------+-------------------------
|
||||
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
|
||||
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
|
||||
| SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--;
|
||||
| | |
|
||||
2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
|
||||
| TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
|
||||
| SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---;
|
||||
| | |
|
||||
3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
|
||||
| TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
|
||||
| SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---;
|
||||
| SCB1 | min | 9: MINh h4.w, h2.---y, h2;
|
||||
| | |
|
||||
4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
|
||||
| TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
|
||||
| SCB0 | add | 14: ADDh h0.x, h0.w---, h0;
|
||||
| SCB1 | add | 13: ADDh h3.w,-h0, h0.---x;
|
||||
| | |
|
||||
5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---;
|
||||
| SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-;
|
||||
| SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--;
|
||||
| | |
|
||||
6 | SCT1 | mov | 18: NRMh h2.xz, h0;
|
||||
| SRB | nrm | 18: NRMh h2.xz, h0;
|
||||
| SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|;
|
||||
| | |
|
||||
7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--;
|
||||
| SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy;
|
||||
| SCB1 | max | 22: MAXh h2.w, h0, h1;
|
||||
| | |
|
||||
8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;
|
||||
| SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0;
|
||||
| TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0;
|
||||
| SCB0 | max | 28: MAXh h5.x, h2.w---, h5;
|
||||
| SCB1 | min | 29: MINh h5.w, h5.---y, h4;
|
||||
| | |
|
||||
9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--;
|
||||
| SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0;
|
||||
| TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0;
|
||||
| SCB0/1 | add | 34: ADDh/2 h2, h0, h2;
|
||||
| | |
|
||||
10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
|
||||
| TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
|
||||
| SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--;
|
||||
| SCB1 | min | 38: MINh h4.w, h1, h5;
|
||||
| | |
|
||||
11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--;
|
||||
| SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0;
|
||||
| TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0;
|
||||
| SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--;
|
||||
| SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-;
|
||||
| | |
|
||||
12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0;
|
||||
| TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0;
|
||||
| SCB0/1 | add | 48: ADDh/2 h0, h0, h3;
|
||||
| | |
|
||||
13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2;
|
||||
| SCB0/1 | mul | 50: MOVh h0, h3;
|
||||
| | |
|
||||
14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---;
|
||||
| SCT1 | set | 52: SGTh h3.w, h3, h5.---x;
|
||||
| SCB0 | set | 54: SLThc0 rc, h5.z---, h5;
|
||||
| SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3;
|
||||
| | |
|
||||
15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2;
|
||||
| SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1;
|
||||
|
||||
Pass SCT TEX SCB
|
||||
1: 0% 100% 25%
|
||||
2: 0% 100% 25%
|
||||
3: 0% 100% 50%
|
||||
4: 0% 100% 50%
|
||||
5: 50% 0% 25%
|
||||
6: 0% 0% 25%
|
||||
7: 100% 0% 25%
|
||||
8: 0% 100% 50%
|
||||
9: 0% 100% 100%
|
||||
10: 0% 100% 50%
|
||||
11: 0% 100% 75%
|
||||
12: 0% 100% 100%
|
||||
13: 100% 0% 100%
|
||||
14: 50% 0% 50%
|
||||
15: 100% 0% 100%
|
||||
|
||||
MEAN: 26% 60% 56%
|
||||
|
||||
Pass SCT0 SCT1 TEX SCB0 SCB1
|
||||
1: 0% 0% 100% 100% 0%
|
||||
2: 0% 0% 100% 100% 0%
|
||||
3: 0% 0% 100% 100% 100%
|
||||
4: 0% 0% 100% 100% 100%
|
||||
5: 100% 100% 0% 100% 0%
|
||||
6: 0% 0% 0% 0% 100%
|
||||
7: 100% 100% 0% 0% 100%
|
||||
8: 0% 0% 100% 100% 100%
|
||||
9: 0% 0% 100% 100% 100%
|
||||
10: 0% 0% 100% 100% 100%
|
||||
11: 0% 0% 100% 100% 100%
|
||||
12: 0% 0% 100% 100% 100%
|
||||
13: 100% 100% 0% 100% 100%
|
||||
14: 100% 100% 0% 100% 100%
|
||||
15: 100% 100% 0% 100% 100%
|
||||
|
||||
MEAN: 33% 33% 60% 86% 80%
|
||||
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
|
||||
Results 15 cycles, 3 r regs, 800,000,000 pixels/s
|
||||
============================================================================*/
|
||||
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
#pragma disablepc all
|
||||
#pragma option O2
|
||||
#pragma option OutColorPrec=fp16
|
||||
#pragma texformat default RGBA8
|
||||
/*==========================================================================*/
|
||||
/*
 * FXAA3 Console pixel shader for PS3, variant with early exit.
 *
 * Same tap-and-blend scheme as the no-early-exit variant, plus a center
 * sample: when the luma range divided by FXAA_CONSOLE_EDGE_THRESHOLD is
 * smaller than the corner luma maximum, the center sample is returned
 * instead of a blend.
 *
 * Steps (1)..(15) are kept in their original order.
 * NOTE(review): they appear to track the 15 passes of the NVShaderPerf
 * schedule in the comment preceding this function -- confirm before
 * reordering anything.
 */
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = luma in perceptual color space (not linear)
    sampler2D tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PS3
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
    // (1) NE corner; its luma gets a small constant offset.
    half4 sampleNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
    half lumaNe = sampleNe.w + half(1.0/512.0);

    // (2) SW corner.
    half4 sampleSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
    half swMinusNe = sampleSw.w - lumaNe;

    // (3) NW corner, plus min/max of the west pair.
    half4 sampleNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
    half westMax = max(sampleNw.w, sampleSw.w);
    half westMin = min(sampleNw.w, sampleSw.w);

    // (4) SE corner, plus the partial direction sums that do not need it.
    half4 sampleSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
    half partialZ = sampleNw.w + swMinusNe;
    half partialX = -sampleNw.w + swMinusNe;

    // (5) Gradient lives in x and z; y is zero so normalize() acts on 2D data.
    half3 gradient = half3(sampleSe.w + partialX, 0.0, -sampleSe.w + partialZ);
    half eastMin = min(lumaNe, sampleSe.w);

    // (6) Unit direction packed into xy (zw receives pos in step 7).
    half4 narrowDirPos;
    narrowDirPos.xy = normalize(gradient).xz;
    half dirAbsMinTimesC = min(abs(narrowDirPos.x), abs(narrowDirPos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);

    // (7) Wide direction, clamped to [-2, 2]; both packs get pos in zw.
    half4 wideDirPos;
    wideDirPos.xy = clamp(narrowDirPos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
    narrowDirPos.zw = pos.xy;
    wideDirPos.zw = pos.xy;
    half eastMax = max(lumaNe, sampleSe.w);

    // (8) Negative narrow tap; the variable holds coordinates, then the sample.
    half4 narrowNeg;
    narrowNeg.xy = narrowDirPos.zw - narrowDirPos.xy * rcpFrameOpt.zw;
    narrowNeg = h4tex2Dlod(tex, half4(narrowNeg.xy, 0.0, 0.0));
    half lumaMax = max(westMax, eastMax);
    half lumaMin = min(westMin, eastMin);

    // (9) Positive narrow tap, then the two-tap average.
    half4 twoTapAvg;
    twoTapAvg.xy = narrowDirPos.zw + narrowDirPos.xy * rcpFrameOpt.zw;
    twoTapAvg = h4tex2Dlod(tex, half4(twoTapAvg.xy, 0.0, 0.0));
    twoTapAvg = (narrowNeg + twoTapAvg) * 0.5;

    // (10) Center sample and luma range including it.
    half4 centerRgby = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));
    half lumaMaxM = max(lumaMax, centerRgby.w);
    half lumaMinM = min(lumaMin, centerRgby.w);

    // (11) Negative wide tap, coordinates of the positive one, scaled range.
    half4 wideNeg;
    wideNeg.xy = wideDirPos.zw - wideDirPos.xy * rcpFrameOpt.xy;
    wideNeg = h4tex2Dlod(tex, half4(wideNeg.xy, 0.0, 0.0));
    half4 result;
    result.xy = wideDirPos.zw + wideDirPos.xy * rcpFrameOpt.xy;
    half scaledRange = (lumaMaxM - lumaMinM) / FXAA_CONSOLE_EDGE_THRESHOLD;

    // (12) Positive wide tap, then the wide-pair average.
    result = h4tex2Dlod(tex, half4(result.xy, 0.0, 0.0));
    result = (wideNeg + result) * 0.5;

    // (13) Four-tap average.
    result = (result + twoTapAvg) * 0.5;

    // (14) Selection predicates.
    bool belowRange = result.w < lumaMin;
    bool aboveRange = result.w > lumaMax;
    bool earlyExit = scaledRange < lumaMax;

    // (15) The early exit is applied last, so it overrides the two-tap fallback.
    if(belowRange || aboveRange) result = twoTapAvg;
    if(earlyExit) result = centerRgby;

    return result;
}
|
||||
/*==========================================================================*/
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 CONSOLE - PC PIXEL SHADER
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
Using a modified version of the PS3 version here to best target old hardware.
|
||||
============================================================================*/
|
||||
#if (FXAA_PC_CONSOLE == 1)
/*--------------------------------------------------------------------------*/
// PC build of the FXAA3 Console pixel shader.
//
// Reads the four corner taps addressed by posPos, treats the .w channel of
// each tap as luma, and builds an edge direction from those four values.
// Two pairs of taps are then fetched around pos along that direction (a
// short pair scaled by rcpFrameOpt.zw and a longer, clamped pair scaled by
// rcpFrameOpt.xy) and averaged.  If the four-tap average has a .w outside
// the corner luma range, the short-pair average is returned instead.
//
// With FXAA_EARLY_EXIT == 1 the center tap is also fetched, and a pixel
// whose local contrast is below the threshold returns that tap unchanged
// (or runs FxaaDiscard when FXAA_DISCARD == 1).
half4 FxaaPixelShader(
    // pos.xy: center of the pixel
    float2 pos,
    // posPos.xy: upper left of the pixel
    // posPos.zw: lower right of the pixel
    float4 posPos,
    // tex.rgb: color in linear or perceptual color space
    // tex.a:   alpha output is a junk value
    FxaaTex tex,
    // Must come from a constant/uniform.
    // rcpFrame.xy: not used by the PC version of FXAA Console
    float2 rcpFrame,
    // Must come from a constant/uniform.
    // rcpFrameOpt.x = 2.0/screenWidthInPixels
    // rcpFrameOpt.y = 2.0/screenHeightInPixels
    // rcpFrameOpt.z = 0.5/screenWidthInPixels
    // rcpFrameOpt.w = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
    // Corner taps, fetched in the order NE, SW, NW, SE.
    half4 tapNe = FxaaTexTop(tex, posPos.zy);
    // NOTE(review): the small bias on the NE luma presumably keeps the
    // direction vector below from being exactly zero - confirm.
    tapNe.w += half(1.0/384.0);
    half4 tapSw = FxaaTexTop(tex, posPos.xw);
    half4 tapNw = FxaaTexTop(tex, posPos.xy);
    half4 tapSe = FxaaTexTop(tex, posPos.zw);
/*--------------------------------------------------------------------------*/
    // Luma differences across the pixel; only x/y/z feed normalize() and
    // y is pinned to zero.  Terms are summed strictly left to right.
    half4 edge;
    edge.x = ((-tapNe.w + tapSw.w) - tapNw.w) + tapSe.w;
    edge.y = 0.0;
    edge.z = ((-tapNe.w + tapSw.w) + tapNw.w) - tapSe.w;
/*==========================================================================*/
#if (FXAA_EARLY_EXIT == 1)
    half4 tapM = FxaaTexTop(tex, pos.xy);
/*--------------------------------------------------------------------------*/
    // Luma range of the four corners ...
    half lumaMin = min(min(tapNw.w, tapSw.w), min(tapNe.w, tapSe.w));
    half lumaMax = max(max(tapNw.w, tapSw.w), max(tapNe.w, tapSe.w));
/*--------------------------------------------------------------------------*/
    // ... widened by the center tap to get the local contrast.
    half lumaMinM = min(lumaMin, tapM.w);
    half lumaMaxM = max(lumaMax, tapM.w);
    half contrast = lumaMaxM - lumaMinM;
/*--------------------------------------------------------------------------*/
    if(contrast < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD))
#if (FXAA_DISCARD == 1)
        FxaaDiscard;
#else
        return tapM;
#endif
#endif
/*==========================================================================*/
    // step.xy = unit edge direction, step.zw = sharpened direction clamped
    // to [-2, 2] per component.
    half4 step;
    step.xy = normalize(edge.xyz).xz;
    half minAbsTimesC = min(abs(step.x), abs(step.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);
    step.zw = clamp(step.xy / minAbsTimesC, half(-2.0), half(2.0));
/*--------------------------------------------------------------------------*/
    // Pixel center held in half precision, as the packed registers of the
    // reference layout do.
    half4 center = pos.xyxy;
/*--------------------------------------------------------------------------*/
    // Short pair: .xy = negative side, .zw = positive side.
    half4 nearUv;
    nearUv.xy = center.xy - step.xy * rcpFrameOpt.zw;
    nearUv.zw = center.zw + step.xy * rcpFrameOpt.zw;
    half4 nearNeg = FxaaTexTop(tex, nearUv.xy);
    half4 nearPos = FxaaTexTop(tex, nearUv.zw);
    half4 nearAvg = (nearNeg + nearPos) * 0.5;
/*--------------------------------------------------------------------------*/
    // Long pair, same layout.
    half4 farUv;
    farUv.xy = center.xy - step.zw * rcpFrameOpt.xy;
    farUv.zw = center.zw + step.zw * rcpFrameOpt.xy;
    half4 farNeg = FxaaTexTop(tex, farUv.xy);
    half4 farPos = FxaaTexTop(tex, farUv.zw);
    half4 farAvg = (farNeg + farPos) * 0.5;
/*--------------------------------------------------------------------------*/
#if (FXAA_EARLY_EXIT == 0)
    // Corner luma range was not computed above when early exit is off.
    half lumaMin = min(min(tapNw.w, tapSw.w), min(tapNe.w, tapSe.w));
    half lumaMax = max(max(tapNw.w, tapSw.w), max(tapNe.w, tapSe.w));
#endif
    half4 fourTap = (farAvg + nearAvg) * 0.5;
/*--------------------------------------------------------------------------*/
    // Fall back to the short-pair average when the four-tap result's .w
    // leaves the range spanned by the corner lumas.
    bool belowRange = fourTap.w < lumaMin;
    bool aboveRange = fourTap.w > lumaMax;
    if(belowRange || aboveRange) fourTap = nearAvg;
/*--------------------------------------------------------------------------*/
    return fourTap; }
/*==========================================================================*/
#endif
|
||||
|
||||
|
||||
|
||||
/*============================================================================
|
||||
|
||||
FXAA3 QUALITY - PC
|
||||
|
|
|
@ -855,823 +855,6 @@ static const char* fxaa_fx =
|
|||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 CONSOLE - 360 PIXEL SHADER\n"
|
||||
"\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"Might be some optimizations left here,\n"
|
||||
"as of this latest change didn't have a PIX dump to verify if TEX bound.\n"
|
||||
"============================================================================*/\n"
|
||||
"#if (FXAA_360 == 1)\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"half4 FxaaPixelShader(\n"
|
||||
" // {xy} = center of pixel\n"
|
||||
" float2 pos,\n"
|
||||
" // {xy__} = upper left of pixel\n"
|
||||
" // {__zw} = lower right of pixel\n"
|
||||
" float4 posPos,\n"
|
||||
" // {rgb_} = color in linear or perceptual color space\n"
|
||||
" // {___a} = alpha output is junk value\n"
|
||||
" FxaaTex tex,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {xy} = rcpFrame not used on PC version of FXAA Console\n"
|
||||
" float2 rcpFrame,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {x___} = 2.0/screenWidthInPixels\n"
|
||||
" // {_y__} = 2.0/screenHeightInPixels\n"
|
||||
" // {__z_} = 0.5/screenWidthInPixels\n"
|
||||
" // {___w} = 0.5/screenHeightInPixels\n"
|
||||
" float4 rcpFrameOpt\n"
|
||||
") {\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 lumaNwNeSwSe;\n"
|
||||
" lumaNwNeSwSe.x = FxaaTexTop(tex, posPos.xy).w;\n"
|
||||
" lumaNwNeSwSe.y = FxaaTexTop(tex, posPos.zy).w;\n"
|
||||
" lumaNwNeSwSe.z = FxaaTexTop(tex, posPos.xw).w;\n"
|
||||
" lumaNwNeSwSe.w = FxaaTexTop(tex, posPos.zw).w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 rgbyM = FxaaTexTop(tex, pos.xy);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" lumaNwNeSwSe.y += 1.0/384.0;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
|
||||
" half2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);\n"
|
||||
" half lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
|
||||
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
|
||||
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half2 dir;\n"
|
||||
" dir.x = dot(lumaNwNeSwSe, float4(-1.0, -1.0, 1.0, 1.0));\n"
|
||||
" dir.y = dot(lumaNwNeSwSe, float4( 1.0, -1.0, 1.0,-1.0));\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half2 dir1;\n"
|
||||
" dir1 = normalize(dir.xy);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * FXAA_CONSOLE_EDGE_SHARPNESS;\n"
|
||||
" half2 dir2;\n"
|
||||
" dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * rcpFrameOpt.zw);\n"
|
||||
" half4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * rcpFrameOpt.zw);\n"
|
||||
" half4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * rcpFrameOpt.xy);\n"
|
||||
" half4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * rcpFrameOpt.xy);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 rgbyA = rgbyN1 * 0.5 + rgbyP1 * 0.5;\n"
|
||||
" half4 rgbyB = rgbyN2 * 0.25 + rgbyP2 * 0.25 + rgbyA * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" bool twoTap = (rgbyB.w < lumaMin) || (rgbyB.w > lumaMax);\n"
|
||||
" if(twoTap) rgbyB.xyz = rgbyA.xyz;\n"
|
||||
" return rgbyB; }\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE\n"
|
||||
"\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"This prototype optimized version thanks to suggestions from Andy Luedke.\n"
|
||||
"Should be fully tex bound in all cases.\n"
|
||||
"As of the FXAA 3.10 release I have not tested this code,\n"
|
||||
"but at least the missing \";\" was fixed.\n"
|
||||
"If it does not work, please let me know so I can fix it.\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"Extra requirements,\n"
|
||||
"(1.) Different inputs: no posPos.\n"
|
||||
"(2.) Different inputs: alias three samplers with different exp bias settings!\n"
|
||||
"(3.) New constants: setup fxaaConst as described below.\n"
|
||||
"============================================================================*/\n"
|
||||
"#if (FXAA_360_OPT == 1)\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"[reduceTempRegUsage(4)]\n"
|
||||
"float4 FxaaPixelShader(\n"
|
||||
" // {xy} = center of pixel\n"
|
||||
" float2 pos,\n"
|
||||
" // Three samplers,\n"
|
||||
" // texExpBias0 = exponent bias 0\n"
|
||||
" // texExpBiasNeg1 = exponent bias -1\n"
|
||||
" // texExpBiasNeg2 = exponent bias -2\n"
|
||||
" // {rgb_} = color in linear or perceptual color space\n"
|
||||
" // {___a} = alpha output is junk value\n"
|
||||
" uniform sampler2D texExpBias0,\n"
|
||||
" uniform sampler2D texExpBiasNeg1,\n"
|
||||
" uniform sampler2D texExpBiasNeg2,\n"
|
||||
" // These must be in physical constant registers and NOT immedates\n"
|
||||
" // Immedates will result in compiler un-optimizing\n"
|
||||
" // width = screen width in pixels\n"
|
||||
" // height = screen height in pixels\n"
|
||||
" fxaaConstDir, // float4(1.0, -1.0, 0.25, -0.25);\n"
|
||||
" fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height);\n"
|
||||
" fxaaConstOuter // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height);\n"
|
||||
") {\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 lumaNwNeSwSe;\n"
|
||||
" asm { \n"
|
||||
" tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false\n"
|
||||
" tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false\n"
|
||||
" tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false\n"
|
||||
" tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false\n"
|
||||
" };\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" lumaNwNeSwSe.y += 1.0/384.0;\n"
|
||||
" float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
|
||||
" float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);\n"
|
||||
" float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);\n"
|
||||
" float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0));\n"
|
||||
" float4 lumaMinM = min(lumaMin, rgbyM.w);\n"
|
||||
" float4 lumaMaxM = max(lumaMax, rgbyM.w);\n"
|
||||
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD)) return rgbyM;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float2 dir;\n"
|
||||
" dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx);\n"
|
||||
" dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy);\n"
|
||||
" dir = normalize(dir);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 dir1 = dir.xyxy * fxaaConstInner.xyzw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 dir2;\n"
|
||||
" float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y));\n"
|
||||
" dir2 = saturate(fxaaConstOuter.zzww * dir.xyxy / FXAA_CONSOLE_EDGE_SHARPNESS / dirAbsMinTimesC + 0.5);\n"
|
||||
" dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0));\n"
|
||||
" float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0));\n"
|
||||
" float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0));\n"
|
||||
" float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0));\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 rgbyA = rgbyN1 + rgbyP1;\n"
|
||||
" half4 rgbyB = rgbyN2 + rgbyP2 * 0.5 + rgbyA;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB;\n"
|
||||
" rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA;\n"
|
||||
" return rgbyR; }\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)\n"
|
||||
"\n"
|
||||
"==============================================================================\n"
|
||||
"The code below does not exactly match the assembly.\n"
|
||||
"I have a feeling that 12 cycles is possible, but was not able to get there.\n"
|
||||
"Might have to increase register count to get full performance.\n"
|
||||
"Note this shader does not use perspective interpolation.\n"
|
||||
"\n"
|
||||
"Use the following cgc options,\n"
|
||||
"\n"
|
||||
" --fenable-bx2 --fastmath --fastprecision --nofloatbindings\n"
|
||||
"\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
" NVSHADERPERF OUTPUT\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"For reference and to aid in debug, output of NVShaderPerf should match this,\n"
|
||||
"\n"
|
||||
"Shader to schedule:\n"
|
||||
" 0: texpkb h0.w(TRUE), v5.zyxx, #0\n"
|
||||
" 2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x\n"
|
||||
" 4: texpkb h0.w(TRUE), v5.xwxx, #0\n"
|
||||
" 6: addh h0.z(TRUE), -h2, h0.w\n"
|
||||
" 7: texpkb h1.w(TRUE), v5, #0\n"
|
||||
" 9: addh h0.x(TRUE), h0.z, -h1.w\n"
|
||||
" 10: addh h3.w(TRUE), h0.z, h1\n"
|
||||
" 11: texpkb h2.w(TRUE), v5.zwzz, #0\n"
|
||||
" 13: addh h0.z(TRUE), h3.w, -h2.w\n"
|
||||
" 14: addh h0.x(TRUE), h2.w, h0\n"
|
||||
" 15: nrmh h1.xz(TRUE), h0_n\n"
|
||||
" 16: minh_m8 h0.x(TRUE), |h1|, |h1.z|\n"
|
||||
" 17: maxh h4.w(TRUE), h0, h1\n"
|
||||
" 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n\n"
|
||||
" 19: movr r1.zw(TRUE), v4.xxxy\n"
|
||||
" 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww\n"
|
||||
" 22: minh h5.w(TRUE), h0, h1\n"
|
||||
" 23: texpkb h0(TRUE), r2.xzxx, #0\n"
|
||||
" 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1\n"
|
||||
" 27: maxh h4.x(TRUE), h2.z, h2.w\n"
|
||||
" 28: texpkb h1(TRUE), r0.zwzz, #0\n"
|
||||
" 30: addh_d2 h1(TRUE), h0, h1\n"
|
||||
" 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
|
||||
" 33: texpkb h0(TRUE), r0, #0\n"
|
||||
" 35: minh h4.z(TRUE), h2, h2.w\n"
|
||||
" 36: fenct TRUE\n"
|
||||
" 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
|
||||
" 39: texpkb h2(TRUE), r1, #0\n"
|
||||
" 41: addh_d2 h0(TRUE), h0, h2\n"
|
||||
" 42: maxh h2.w(TRUE), h4, h4.x\n"
|
||||
" 43: minh h2.x(TRUE), h5.w, h4.z\n"
|
||||
" 44: addh_d2 h0(TRUE), h0, h1\n"
|
||||
" 45: slth h2.x(TRUE), h0.w, h2\n"
|
||||
" 46: sgth h2.w(TRUE), h0, h2\n"
|
||||
" 47: movh h0(TRUE), h0\n"
|
||||
" 48: addx.c0 rc(TRUE), h2, h2.w\n"
|
||||
" 49: movh h0(c0.NE.x), h1\n"
|
||||
"\n"
|
||||
"IPU0 ------ Simplified schedule: --------\n"
|
||||
"Pass | Unit | uOp | PC: Op\n"
|
||||
"-----+--------+------+-------------------------\n"
|
||||
" 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
|
||||
" | SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-;\n"
|
||||
" | | |\n"
|
||||
" 2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
|
||||
" | SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-;\n"
|
||||
" | | |\n"
|
||||
" 3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;\n"
|
||||
" | SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---;\n"
|
||||
" | SCB1 | add | 10: ADDh h3.w, h0.---z, h1;\n"
|
||||
" | | |\n"
|
||||
" 4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | add | 14: ADDh h0.x, h2.w---, h0;\n"
|
||||
" | SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-;\n"
|
||||
" | | |\n"
|
||||
" 5 | SCT1 | mov | 15: NRMh h1.xz, h0;\n"
|
||||
" | SRB | nrm | 15: NRMh h1.xz, h0;\n"
|
||||
" | SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|;\n"
|
||||
" | SCB1 | max | 17: MAXh h4.w, h0, h1;\n"
|
||||
" | | |\n"
|
||||
" 6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0;\n"
|
||||
" | SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy;\n"
|
||||
" | SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-;\n"
|
||||
" | SCB1 | min | 22: MINh h5.w, h0, h1;\n"
|
||||
" | | |\n"
|
||||
" 7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---;\n"
|
||||
" | SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1;\n"
|
||||
" | | |\n"
|
||||
" 8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;\n"
|
||||
" | SCB0/1 | add | 30: ADDh/2 h1, h0, h1;\n"
|
||||
" | | |\n"
|
||||
" 9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--;\n"
|
||||
" | SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0;\n"
|
||||
" | TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0;\n"
|
||||
" | SCB1 | min | 35: MINh h4.z, h2, h2.--w-;\n"
|
||||
" | | |\n"
|
||||
" 10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--;\n"
|
||||
" | SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0;\n"
|
||||
" | TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0;\n"
|
||||
" | SCB0/1 | add | 41: ADDh/2 h0, h0, h2;\n"
|
||||
" | | |\n"
|
||||
" 11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---;\n"
|
||||
" | SCT1 | max | 42: MAXh h2.w, h4, h4.---x;\n"
|
||||
" | SCB0/1 | add | 44: ADDh/2 h0, h0, h1;\n"
|
||||
" | | |\n"
|
||||
" 12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;\n"
|
||||
" | SCT1 | set | 46: SGTh h2.w, h0, h2;\n"
|
||||
" | SCB0/1 | mul | 47: MOVh h0, h0;\n"
|
||||
" | | |\n"
|
||||
" 13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;\n"
|
||||
" | SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;\n"
|
||||
" \n"
|
||||
"Pass SCT TEX SCB\n"
|
||||
" 1: 0\% 100\% 25\%\n"
|
||||
" 2: 0\% 100\% 25\%\n"
|
||||
" 3: 0\% 100\% 50\%\n"
|
||||
" 4: 0\% 100\% 50\%\n"
|
||||
" 5: 0\% 0\% 50\%\n"
|
||||
" 6: 100\% 0\% 75\%\n"
|
||||
" 7: 0\% 100\% 75\%\n"
|
||||
" 8: 0\% 100\% 100\%\n"
|
||||
" 9: 0\% 100\% 25\%\n"
|
||||
" 10: 0\% 100\% 100\%\n"
|
||||
" 11: 50\% 0\% 100\%\n"
|
||||
" 12: 50\% 0\% 100\%\n"
|
||||
" 13: 25\% 0\% 100\%\n"
|
||||
"\n"
|
||||
"MEAN: 17\% 61\% 67\%\n"
|
||||
"\n"
|
||||
"Pass SCT0 SCT1 TEX SCB0 SCB1\n"
|
||||
" 1: 0\% 0\% 100\% 0\% 100\%\n"
|
||||
" 2: 0\% 0\% 100\% 0\% 100\%\n"
|
||||
" 3: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 4: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 5: 0\% 0\% 0\% 100\% 100\%\n"
|
||||
" 6: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
" 7: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 8: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 9: 0\% 0\% 100\% 0\% 100\%\n"
|
||||
" 10: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 11: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
" 12: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
" 13: 100\% 0\% 0\% 100\% 100\%\n"
|
||||
"\n"
|
||||
"MEAN: 30\% 23\% 61\% 76\% 100\%\n"
|
||||
"Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5\n"
|
||||
"Results 13 cycles, 3 r regs, 923,076,923 pixels/s\n"
|
||||
"============================================================================*/\n"
|
||||
"#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"#pragma disablepc all\n"
|
||||
"#pragma option O3\n"
|
||||
"#pragma option OutColorPrec=fp16\n"
|
||||
"#pragma texformat default RGBA8\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"half4 FxaaPixelShader(\n"
|
||||
" // {xy} = center of pixel\n"
|
||||
" float2 pos,\n"
|
||||
" // {xy__} = upper left of pixel\n"
|
||||
" // {__zw} = lower right of pixel\n"
|
||||
" float4 posPos,\n"
|
||||
" // {rgb_} = color in linear or perceptual color space\n"
|
||||
" // {___a} = luma in perceptual color space (not linear)\n"
|
||||
" sampler2D tex,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {xy} = rcpFrame not used on PS3\n"
|
||||
" float2 rcpFrame,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {x___} = 2.0/screenWidthInPixels\n"
|
||||
" // {_y__} = 2.0/screenHeightInPixels\n"
|
||||
" // {__z_} = 0.5/screenWidthInPixels\n"
|
||||
" // {___w} = 0.5/screenHeightInPixels\n"
|
||||
" float4 rcpFrameOpt\n"
|
||||
") {\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (1)\n"
|
||||
" half4 dir;\n"
|
||||
" half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));\n"
|
||||
" lumaNe.w += half(1.0/512.0);\n"
|
||||
" dir.x = -lumaNe.w;\n"
|
||||
" dir.z = -lumaNe.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (2)\n"
|
||||
" half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));\n"
|
||||
" dir.x += lumaSw.w;\n"
|
||||
" dir.z += lumaSw.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (3)\n"
|
||||
" half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));\n"
|
||||
" dir.x -= lumaNw.w;\n"
|
||||
" dir.z += lumaNw.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (4)\n"
|
||||
" half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));\n"
|
||||
" dir.x += lumaSe.w;\n"
|
||||
" dir.z -= lumaSe.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (5)\n"
|
||||
" half4 dir1_pos;\n"
|
||||
" dir1_pos.xy = normalize(dir.xyz).xz;\n"
|
||||
" half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (6)\n"
|
||||
" half4 dir2_pos;\n"
|
||||
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));\n"
|
||||
" dir1_pos.zw = pos.xy;\n"
|
||||
" dir2_pos.zw = pos.xy;\n"
|
||||
" half4 temp1N;\n"
|
||||
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (7)\n"
|
||||
" temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));\n"
|
||||
" half4 rgby1;\n"
|
||||
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (8)\n"
|
||||
" rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));\n"
|
||||
" rgby1 = (temp1N + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (9)\n"
|
||||
" half4 temp2N;\n"
|
||||
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (10)\n"
|
||||
" half4 rgby2;\n"
|
||||
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));\n"
|
||||
" rgby2 = (temp2N + rgby2) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (11)\n"
|
||||
" // compilier moves these scalar ops up to other cycles\n"
|
||||
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
|
||||
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
|
||||
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (12)\n"
|
||||
" bool twoTapLt = rgby2.w < lumaMin;\n"
|
||||
" bool twoTapGt = rgby2.w > lumaMax;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (13)\n"
|
||||
" if(twoTapLt || twoTapGt) rgby2 = rgby1;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" return rgby2; }\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)\n"
|
||||
"\n"
|
||||
"==============================================================================\n"
|
||||
"The code mostly matches the assembly.\n"
|
||||
"I have a feeling that 14 cycles is possible, but was not able to get there.\n"
|
||||
"Might have to increase register count to get full performance.\n"
|
||||
"Note this shader does not use perspective interpolation.\n"
|
||||
"\n"
|
||||
"Use the following cgc options,\n"
|
||||
"\n"
|
||||
" --fenable-bx2 --fastmath --fastprecision --nofloatbindings\n"
|
||||
"\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
" NVSHADERPERF OUTPUT\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"For reference and to aid in debug, output of NVShaderPerf should match this,\n"
|
||||
"\n"
|
||||
"Shader to schedule:\n"
|
||||
" 0: texpkb h0.w(TRUE), v5.zyxx, #0\n"
|
||||
" 2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x\n"
|
||||
" 4: texpkb h1.w(TRUE), v5.xwxx, #0\n"
|
||||
" 6: addh h0.x(TRUE), h1.w, -h2.y\n"
|
||||
" 7: texpkb h2.w(TRUE), v5.zwzz, #0\n"
|
||||
" 9: minh h4.w(TRUE), h2.y, h2\n"
|
||||
" 10: maxh h5.x(TRUE), h2.y, h2.w\n"
|
||||
" 11: texpkb h0.w(TRUE), v5, #0\n"
|
||||
" 13: addh h3.w(TRUE), -h0, h0.x\n"
|
||||
" 14: addh h0.x(TRUE), h0.w, h0\n"
|
||||
" 15: addh h0.z(TRUE), -h2.w, h0.x\n"
|
||||
" 16: addh h0.x(TRUE), h2.w, h3.w\n"
|
||||
" 17: minh h5.y(TRUE), h0.w, h1.w\n"
|
||||
" 18: nrmh h2.xz(TRUE), h0_n\n"
|
||||
" 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|\n"
|
||||
" 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w\n"
|
||||
" 21: movr r1.zw(TRUE), v4.xxxy\n"
|
||||
" 22: maxh h2.w(TRUE), h0, h1\n"
|
||||
" 23: fenct TRUE\n"
|
||||
" 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz\n"
|
||||
" 26: texpkb h0(TRUE), r0, #0\n"
|
||||
" 28: maxh h5.x(TRUE), h2.w, h5\n"
|
||||
" 29: minh h5.w(TRUE), h5.y, h4\n"
|
||||
" 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz\n"
|
||||
" 32: texpkb h2(TRUE), r1, #0\n"
|
||||
" 34: addh_d2 h2(TRUE), h0, h2\n"
|
||||
" 35: texpkb h1(TRUE), v4, #0\n"
|
||||
" 37: maxh h5.y(TRUE), h5.x, h1.w\n"
|
||||
" 38: minh h4.w(TRUE), h1, h5\n"
|
||||
" 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
|
||||
" 41: texpkb h0(TRUE), r0, #0\n"
|
||||
" 43: addh_m8 h5.z(TRUE), h5.y, -h4.w\n"
|
||||
" 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz\n"
|
||||
" 46: texpkb h3(TRUE), r2, #0\n"
|
||||
" 48: addh_d2 h0(TRUE), h0, h3\n"
|
||||
" 49: addh_d2 h3(TRUE), h0, h2\n"
|
||||
" 50: movh h0(TRUE), h3\n"
|
||||
" 51: slth h3.x(TRUE), h3.w, h5.w\n"
|
||||
" 52: sgth h3.w(TRUE), h3, h5.x\n"
|
||||
" 53: addx.c0 rc(TRUE), h3.x, h3\n"
|
||||
" 54: slth.c0 rc(TRUE), h5.z, h5\n"
|
||||
" 55: movh h0(c0.NE.w), h2\n"
|
||||
" 56: movh h0(c0.NE.x), h1\n"
|
||||
"\n"
|
||||
"IPU0 ------ Simplified schedule: --------\n"
|
||||
"Pass | Unit | uOp | PC: Op\n"
|
||||
"-----+--------+------+-------------------------\n"
|
||||
" 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--;\n"
|
||||
" | | |\n"
|
||||
" 2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---;\n"
|
||||
" | | |\n"
|
||||
" 3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---;\n"
|
||||
" | SCB1 | min | 9: MINh h4.w, h2.---y, h2;\n"
|
||||
" | | |\n"
|
||||
" 4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;\n"
|
||||
" | SCB0 | add | 14: ADDh h0.x, h0.w---, h0;\n"
|
||||
" | SCB1 | add | 13: ADDh h3.w,-h0, h0.---x;\n"
|
||||
" | | |\n"
|
||||
" 5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---;\n"
|
||||
" | SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-;\n"
|
||||
" | SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--;\n"
|
||||
" | | |\n"
|
||||
" 6 | SCT1 | mov | 18: NRMh h2.xz, h0;\n"
|
||||
" | SRB | nrm | 18: NRMh h2.xz, h0;\n"
|
||||
" | SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|;\n"
|
||||
" | | |\n"
|
||||
" 7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--;\n"
|
||||
" | SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy;\n"
|
||||
" | SCB1 | max | 22: MAXh h2.w, h0, h1;\n"
|
||||
" | | |\n"
|
||||
" 8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;\n"
|
||||
" | SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0;\n"
|
||||
" | SCB0 | max | 28: MAXh h5.x, h2.w---, h5;\n"
|
||||
" | SCB1 | min | 29: MINh h5.w, h5.---y, h4;\n"
|
||||
" | | |\n"
|
||||
" 9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--;\n"
|
||||
" | SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0;\n"
|
||||
" | SCB0/1 | add | 34: ADDh/2 h2, h0, h2;\n"
|
||||
" | | |\n"
|
||||
" 10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;\n"
|
||||
" | SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--;\n"
|
||||
" | SCB1 | min | 38: MINh h4.w, h1, h5;\n"
|
||||
" | | |\n"
|
||||
" 11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--;\n"
|
||||
" | SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0;\n"
|
||||
" | TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0;\n"
|
||||
" | SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--;\n"
|
||||
" | SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-;\n"
|
||||
" | | |\n"
|
||||
" 12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0;\n"
|
||||
" | TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0;\n"
|
||||
" | SCB0/1 | add | 48: ADDh/2 h0, h0, h3;\n"
|
||||
" | | |\n"
|
||||
" 13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2;\n"
|
||||
" | SCB0/1 | mul | 50: MOVh h0, h3;\n"
|
||||
" | | |\n"
|
||||
" 14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---;\n"
|
||||
" | SCT1 | set | 52: SGTh h3.w, h3, h5.---x;\n"
|
||||
" | SCB0 | set | 54: SLThc0 rc, h5.z---, h5;\n"
|
||||
" | SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3;\n"
|
||||
" | | |\n"
|
||||
" 15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2;\n"
|
||||
" | SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1;\n"
|
||||
" \n"
|
||||
"Pass SCT TEX SCB\n"
|
||||
" 1: 0\% 100\% 25\%\n"
|
||||
" 2: 0\% 100\% 25\%\n"
|
||||
" 3: 0\% 100\% 50\%\n"
|
||||
" 4: 0\% 100\% 50\%\n"
|
||||
" 5: 50\% 0\% 25\%\n"
|
||||
" 6: 0\% 0\% 25\%\n"
|
||||
" 7: 100\% 0\% 25\%\n"
|
||||
" 8: 0\% 100\% 50\%\n"
|
||||
" 9: 0\% 100\% 100\%\n"
|
||||
" 10: 0\% 100\% 50\%\n"
|
||||
" 11: 0\% 100\% 75\%\n"
|
||||
" 12: 0\% 100\% 100\%\n"
|
||||
" 13: 100\% 0\% 100\%\n"
|
||||
" 14: 50\% 0\% 50\%\n"
|
||||
" 15: 100\% 0\% 100\%\n"
|
||||
"\n"
|
||||
"MEAN: 26\% 60\% 56\%\n"
|
||||
"\n"
|
||||
"Pass SCT0 SCT1 TEX SCB0 SCB1\n"
|
||||
" 1: 0\% 0\% 100\% 100\% 0\%\n"
|
||||
" 2: 0\% 0\% 100\% 100\% 0\%\n"
|
||||
" 3: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 4: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 5: 100\% 100\% 0\% 100\% 0\%\n"
|
||||
" 6: 0\% 0\% 0\% 0\% 100\%\n"
|
||||
" 7: 100\% 100\% 0\% 0\% 100\%\n"
|
||||
" 8: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 9: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 10: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 11: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 12: 0\% 0\% 100\% 100\% 100\%\n"
|
||||
" 13: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
" 14: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
" 15: 100\% 100\% 0\% 100\% 100\%\n"
|
||||
"\n"
|
||||
"MEAN: 33\% 33\% 60\% 86\% 80\%\n"
|
||||
"Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5\n"
|
||||
"Results 15 cycles, 3 r regs, 800,000,000 pixels/s\n"
|
||||
"============================================================================*/\n"
|
||||
"#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"#pragma disablepc all\n"
|
||||
"#pragma option O2\n"
|
||||
"#pragma option OutColorPrec=fp16\n"
|
||||
"#pragma texformat default RGBA8\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"half4 FxaaPixelShader(\n"
|
||||
" // {xy} = center of pixel\n"
|
||||
" float2 pos,\n"
|
||||
" // {xy__} = upper left of pixel\n"
|
||||
" // {__zw} = lower right of pixel\n"
|
||||
" float4 posPos,\n"
|
||||
" // {rgb_} = color in linear or perceptual color space\n"
|
||||
" // {___a} = luma in perceptual color space (not linear)\n"
|
||||
" sampler2D tex,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {xy} = rcpFrame not used on PS3\n"
|
||||
" float2 rcpFrame,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {x___} = 2.0/screenWidthInPixels\n"
|
||||
" // {_y__} = 2.0/screenHeightInPixels\n"
|
||||
" // {__z_} = 0.5/screenWidthInPixels\n"
|
||||
" // {___w} = 0.5/screenHeightInPixels\n"
|
||||
" float4 rcpFrameOpt\n"
|
||||
") {\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (1)\n"
|
||||
" half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));\n"
|
||||
" half lumaNe = rgbyNe.w + half(1.0/512.0);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (2)\n"
|
||||
" half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));\n"
|
||||
" half lumaSwNegNe = lumaSw.w - lumaNe;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (3)\n"
|
||||
" half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));\n"
|
||||
" half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);\n"
|
||||
" half lumaMinNwSw = min(lumaNw.w, lumaSw.w);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (4)\n"
|
||||
" half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));\n"
|
||||
" half dirZ = lumaNw.w + lumaSwNegNe;\n"
|
||||
" half dirX = -lumaNw.w + lumaSwNegNe;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (5)\n"
|
||||
" half3 dir;\n"
|
||||
" dir.y = 0.0;\n"
|
||||
" dir.x = lumaSe.w + dirX;\n"
|
||||
" dir.z = -lumaSe.w + dirZ;\n"
|
||||
" half lumaMinNeSe = min(lumaNe, lumaSe.w);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (6)\n"
|
||||
" half4 dir1_pos;\n"
|
||||
" dir1_pos.xy = normalize(dir).xz;\n"
|
||||
" half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (7)\n"
|
||||
" half4 dir2_pos;\n"
|
||||
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));\n"
|
||||
" dir1_pos.zw = pos.xy;\n"
|
||||
" dir2_pos.zw = pos.xy;\n"
|
||||
" half lumaMaxNeSe = max(lumaNe, lumaSe.w);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (8)\n"
|
||||
" half4 temp1N;\n"
|
||||
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
" temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));\n"
|
||||
" half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);\n"
|
||||
" half lumaMin = min(lumaMinNwSw, lumaMinNeSe);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (9)\n"
|
||||
" half4 rgby1;\n"
|
||||
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
" rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));\n"
|
||||
" rgby1 = (temp1N + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (10)\n"
|
||||
" half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));\n"
|
||||
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
|
||||
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (11)\n"
|
||||
" half4 temp2N;\n"
|
||||
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));\n"
|
||||
" half4 rgby2;\n"
|
||||
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE_EDGE_THRESHOLD;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (12)\n"
|
||||
" rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));\n"
|
||||
" rgby2 = (temp2N + rgby2) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (13)\n"
|
||||
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (14)\n"
|
||||
" bool twoTapLt = rgby2.w < lumaMin;\n"
|
||||
" bool twoTapGt = rgby2.w > lumaMax;\n"
|
||||
" bool earlyExit = lumaRangeM < lumaMax;\n"
|
||||
" bool twoTap = twoTapLt || twoTapGt;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"// (15)\n"
|
||||
" if(twoTap) rgby2 = rgby1;\n"
|
||||
" if(earlyExit) rgby2 = rgbyM;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" return rgby2; }\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 CONSOLE - PC PIXEL SHADER\n"
|
||||
"\n"
|
||||
"------------------------------------------------------------------------------\n"
|
||||
"Using a modified version of the PS3 version here to best target old hardware.\n"
|
||||
"============================================================================*/\n"
|
||||
"#if (FXAA_PC_CONSOLE == 1)\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
"half4 FxaaPixelShader(\n"
|
||||
" // {xy} = center of pixel\n"
|
||||
" float2 pos,\n"
|
||||
" // {xy__} = upper left of pixel\n"
|
||||
" // {__zw} = lower right of pixel\n"
|
||||
" float4 posPos,\n"
|
||||
" // {rgb_} = color in linear or perceptual color space\n"
|
||||
" // {___a} = alpha output is junk value\n"
|
||||
" FxaaTex tex,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {xy} = rcpFrame not used on PC version of FXAA Console\n"
|
||||
" float2 rcpFrame,\n"
|
||||
" // This must be from a constant/uniform.\n"
|
||||
" // {x___} = 2.0/screenWidthInPixels\n"
|
||||
" // {_y__} = 2.0/screenHeightInPixels\n"
|
||||
" // {__z_} = 0.5/screenWidthInPixels\n"
|
||||
" // {___w} = 0.5/screenHeightInPixels\n"
|
||||
" float4 rcpFrameOpt\n"
|
||||
") {\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 dir;\n"
|
||||
" dir.y = 0.0;\n"
|
||||
" half4 lumaNe = FxaaTexTop(tex, posPos.zy);\n"
|
||||
" lumaNe.w += half(1.0/384.0);\n"
|
||||
" dir.x = -lumaNe.w;\n"
|
||||
" dir.z = -lumaNe.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 lumaSw = FxaaTexTop(tex, posPos.xw);\n"
|
||||
" dir.x += lumaSw.w;\n"
|
||||
" dir.z += lumaSw.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 lumaNw = FxaaTexTop(tex, posPos.xy);\n"
|
||||
" dir.x -= lumaNw.w;\n"
|
||||
" dir.z += lumaNw.w;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 lumaSe = FxaaTexTop(tex, posPos.zw);\n"
|
||||
" dir.x += lumaSe.w;\n"
|
||||
" dir.z -= lumaSe.w;\n"
|
||||
"/*==========================================================================*/\n"
|
||||
" #if (FXAA_EARLY_EXIT == 1)\n"
|
||||
" half4 rgbyM = FxaaTexTop(tex, pos.xy);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
|
||||
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half lumaMinM = min(lumaMin, rgbyM.w);\n"
|
||||
" half lumaMaxM = max(lumaMax, rgbyM.w);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE_EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE_EDGE_THRESHOLD))\n"
|
||||
" #if (FXAA_DISCARD == 1)\n"
|
||||
" FxaaDiscard;\n"
|
||||
" #else\n"
|
||||
" return rgbyM;\n"
|
||||
" #endif\n"
|
||||
" #endif\n"
|
||||
"/*==========================================================================*/\n"
|
||||
" half4 dir1_pos;\n"
|
||||
" dir1_pos.xy = normalize(dir.xyz).xz;\n"
|
||||
" half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE_EDGE_SHARPNESS);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 dir2_pos;\n"
|
||||
" dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));\n"
|
||||
" dir1_pos.zw = pos.xy;\n"
|
||||
" dir2_pos.zw = pos.xy;\n"
|
||||
" half4 temp1N;\n"
|
||||
" temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" temp1N = FxaaTexTop(tex, temp1N.xy);\n"
|
||||
" half4 rgby1;\n"
|
||||
" rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" rgby1 = FxaaTexTop(tex, rgby1.xy);\n"
|
||||
" rgby1 = (temp1N + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 temp2N;\n"
|
||||
" temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" temp2N = FxaaTexTop(tex, temp2N.xy);\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" half4 rgby2;\n"
|
||||
" rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;\n"
|
||||
" rgby2 = FxaaTexTop(tex, rgby2.xy);\n"
|
||||
" rgby2 = (temp2N + rgby2) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" #if (FXAA_EARLY_EXIT == 0)\n"
|
||||
" half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));\n"
|
||||
" half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));\n"
|
||||
" #endif\n"
|
||||
" rgby2 = (rgby2 + rgby1) * 0.5;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" bool twoTapLt = rgby2.w < lumaMin;\n"
|
||||
" bool twoTapGt = rgby2.w > lumaMax;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" if(twoTapLt || twoTapGt) rgby2 = rgby1;\n"
|
||||
"/*--------------------------------------------------------------------------*/\n"
|
||||
" return rgby2; }\n"
|
||||
"/*==========================================================================*/\n"
|
||||
"#endif\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*============================================================================\n"
|
||||
"\n"
|
||||
" FXAA3 QUALITY - PC\n"
|
||||
"\n"
|
||||
"============================================================================*/\n"
|
||||
|
|
Loading…
Reference in New Issue