pcsx2/plugins/GSdx/res/fxaa.fx

1997 lines
78 KiB
HLSL
Raw Normal View History

#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
// Select the PC quality algorithm (FXAA_PC would otherwise default to 0 further down).
#define FXAA_PC 1
// Pre-set the sub-pixel amount; this bypasses the #ifndef default of (3.0/4.0) further down.
#define FXAA_QUALITY__SUBPIX 0.0
// Shader model 4.0 and up: texture/sampler objects, a constant buffer and SV_ semantics.
#if SHADER_MODEL >= 0x400
// 0x500 and above selects the HLSL 5 path, everything else in this branch the HLSL 4 path.
#if SHADER_MODEL >= 0x500
#define FXAA_HLSL_5 1
#else
#define FXAA_HLSL_4 1
#endif
// Texture and sampler objects (their use is outside this section; presumably the FXAA input).
Texture2D Texture;
SamplerState TextureSampler;
// Constants named after the rcpFrame / rcpFrameOpt inputs described in the
// integration checklist below.
cbuffer cb0
{
float4 _rcpFrame;
float4 _rcpFrameOpt;
};
// Pixel shader input: position and texture coordinate.
struct PS_INPUT
{
float4 p : SV_Position;
float2 t : TEXCOORD0;
};
// Pixel shader output: a single color written to render target 0.
struct PS_OUTPUT
{
float4 c : SV_Target0;
};
// Shader model 3.0 and below: sampler and constants bound to explicit registers.
#elif SHADER_MODEL <= 0x300
#define FXAA_HLSL_3 1
sampler Texture : register(s0);
float4 _rcpFrame : register(c0);
float4 _rcpFrameOpt : register(c1);
// Pixel shader input; the semantic of p depends on the shader model.
struct PS_INPUT
{
// Below 0x300 the position is read from TEXCOORD1, otherwise from VPOS.
#if SHADER_MODEL < 0x300
float4 p : TEXCOORD1;
#else
float4 p : VPOS;
#endif
float2 t : TEXCOORD0;
};
// Pixel shader output: a single color.
struct PS_OUTPUT
{
float4 c : COLOR;
};
#endif
/*============================================================================
NVIDIA FXAA 3.10 by TIMOTHY LOTTES
------------------------------------------------------------------------------
COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED.
------------------------------------------------------------------------------
TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED
*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA
OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR
CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR
LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION,
OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE
THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
------------------------------------------------------------------------------
INTEGRATION CHECKLIST
------------------------------------------------------------------------------
(1.)
In the shader source,
setup defines for the desired configuration.
Example,
#define FXAA_PC 1
#define FXAA_HLSL_3 1
#define FXAA_QUALITY__PRESET 12
#define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0)
#define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0)
(2.)
Then include this file,
#include "Fxaa3.h"
(3.)
Then call the FXAA pixel shader from within your desired shader,
return FxaaPixelShader(pos, posPos, tex, rcpFrame, rcpFrameOpt);
(4.)
Ensure the pass prior to FXAA outputs RGBL.
See next section.
(5.)
Setup engine to provide "rcpFrame" and "rcpFrameOpt" constants.
Not using constants will result in a performance loss.
// {x_} = 1.0/screenWidthInPixels
// {_y} = 1.0/screenHeightInPixels
float2 rcpFrame
// This must be from a constant/uniform.
// {x___} = 2.0/screenWidthInPixels
// {_y__} = 2.0/screenHeightInPixels
// {__z_} = 0.5/screenWidthInPixels
// {___w} = 0.5/screenHeightInPixels
float4 rcpFrameOpt
(5.a.)
Optionally change to this for sharper FXAA Console,
// This must be from a constant/uniform.
// {x___} = 2.0/screenWidthInPixels
// {_y__} = 2.0/screenHeightInPixels
// {__z_} = 0.333/screenWidthInPixels
// {___w} = 0.333/screenHeightInPixels
float4 rcpFrameOpt
(6.)
Have FXAA vertex shader run as a full screen triangle,
and output "pos" and "posPos" such that inputs in the pixel shader provide,
// {xy} = center of pixel
float2 pos,
// {xy__} = upper left of pixel
// {__zw} = lower right of pixel
float4 posPos,
(7.)
Ensure the texture sampler used by FXAA is set to bilinear filtering.
------------------------------------------------------------------------------
INTEGRATION - RGBL AND COLORSPACE
------------------------------------------------------------------------------
FXAA3 requires RGBL as input.
RGB should be LDR (low dynamic range).
Specifically do FXAA after tonemapping.
RGB data as returned by a texture fetch can be linear or non-linear.
Note an "sRGB format" texture counts as linear,
because the result of a texture fetch is linear data.
Regular "RGBA8" textures in the sRGB colorspace are non-linear.
Luma must be stored in the alpha channel prior to running FXAA.
This luma should be in a perceptual space (could be gamma 2.0).
Example pass before FXAA where output is gamma 2.0 encoded,
color.rgb = ToneMap(color.rgb); // linear color output
color.rgb = sqrt(color.rgb); // gamma 2.0 color output
return color;
To use FXAA,
color.rgb = ToneMap(color.rgb); // linear color output
color.rgb = sqrt(color.rgb); // gamma 2.0 color output
color.a = dot(color.rgb, float3(0.299, 0.587, 0.114)); // compute luma
return color;
Another example where output is linear encoded,
say for instance writing to an sRGB formatted render target,
where the render target does the conversion back to sRGB after blending,
color.rgb = ToneMap(color.rgb); // linear color output
return color;
To use FXAA,
color.rgb = ToneMap(color.rgb); // linear color output
color.a = sqrt(dot(color.rgb, float3(0.299, 0.587, 0.114))); // compute luma
return color;
Getting luma correct is required for the algorithm to work correctly.
------------------------------------------------------------------------------
BEING LINEARLY CORRECT?
------------------------------------------------------------------------------
Applying FXAA to a framebuffer with linear RGB color will look worse.
This is very counter-intuitive, but happens to be true in this case.
The reason is that dithering artifacts will be more visible
in a linear colorspace.
------------------------------------------------------------------------------
COMPLEX INTEGRATION
------------------------------------------------------------------------------
Q. What if the engine is blending into RGB before wanting to run FXAA?
A. In the last opaque pass prior to FXAA,
have the pass write out luma into alpha.
Then blend into RGB only.
FXAA should be able to run ok
assuming the blending pass did not add any aliasing.
This should be the common case for particles and common blending passes.
============================================================================*/
/*============================================================================
INTEGRATION KNOBS
============================================================================*/
//
// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE).
// FXAA_360_OPT is a prototype for the new optimized 360 version.
//
// 1 = Use API.
// 0 = Don't use API.
//
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_PS3)
    // Console algorithm, PS3 variant. Off unless the includer enables it.
    #define FXAA_PS3 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_360)
    // Console algorithm, Xbox 360 variant. Off unless the includer enables it.
    #define FXAA_360 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_360_OPT)
    // Prototype of the optimized Xbox 360 variant. Off unless enabled.
    #define FXAA_360_OPT 0
#endif
/*==========================================================================*/
#if !defined(FXAA_PC)
    // FXAA Quality: the high quality PC algorithm.
    #define FXAA_PC 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_PC_CONSOLE)
    // The console algorithm built for PC,
    // provided for developers targeting really low spec machines.
    #define FXAA_PC_CONSOLE 0
#endif
/*--------------------------------------------------------------------------*/
// Shading language selectors. Each one defaults to 0 (not selected);
// the includer sets the one matching its API to 1 before this point.
#if !defined(FXAA_GLSL_120)
    #define FXAA_GLSL_120 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_GLSL_130)
    #define FXAA_GLSL_130 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_HLSL_3)
    #define FXAA_HLSL_3 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_HLSL_4)
    #define FXAA_HLSL_4 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_HLSL_5)
    #define FXAA_HLSL_5 0
#endif
/*==========================================================================*/
#if !defined(FXAA_EARLY_EXIT)
    //
    // Early exit path of the algorithm.
    //   1 = On (default).
    //   0 = Off.
    // Cost notes: enabling it costs 2 extra cycles on PS3; disabling it
    // costs 10ths of a millisecond on 360. With it disabled the console
    // output gets more blurry, which is why it is enabled by default.
    //
    #define FXAA_EARLY_EXIT 1
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_DISCARD)
    //
    // Currently only valid for PC OpenGL.
    //   1 = Discard pixels which do not need AA
    //       (for APIs which enable concurrent TEX+ROP from the same surface).
    //   0 = Return the unchanged color for pixels which do not need AA.
    //
    #define FXAA_DISCARD 0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_FAST_PIXEL_OFFSET)
    //
    // GLSL 120 only.
    //   1 = The GL API supports fast pixel offsets.
    //   0 = Do not use fast pixel offsets.
    // Enabled automatically when any of the listed GL extensions is present.
    //
    #if defined(GL_EXT_gpu_shader4) || defined(GL_NV_gpu_shader5) || defined(GL_ARB_gpu_shader5)
        #define FXAA_FAST_PIXEL_OFFSET 1
    #else
        #define FXAA_FAST_PIXEL_OFFSET 0
    #endif
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_GATHER4_ALPHA)
    //
    //   1 = The API supports gather4 on the alpha channel.
    //   0 = The API does not support gather4 on the alpha channel.
    // Enabled automatically for HLSL 5 and for the listed GL extensions.
    //
    #if (FXAA_HLSL_5 == 1) || defined(GL_ARB_gpu_shader5) || defined(GL_NV_gpu_shader5)
        #define FXAA_GATHER4_ALPHA 1
    #else
        #define FXAA_GATHER4_ALPHA 0
    #endif
#endif
/*============================================================================
FXAA CONSOLE - TUNING KNOBS
============================================================================*/
#if !defined(FXAA_CONSOLE__EDGE_SHARPNESS)
    //
    // Controls the sharpness of edges.
    //
    // The PS3 is ALU bound, so only two values are safe there: 4 and 8.
    // Those rely on the shader's ability to do a free *|/ by 4|8.
    //
    //   8.0 - sharper (selected)
    //   4.0 - softer
    //   2.0 - really soft (good for vector graphics inputs)
    //
    #define FXAA_CONSOLE__EDGE_SHARPNESS 8.0
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_CONSOLE__EDGE_THRESHOLD)
    //
    // Minimum amount of local contrast required to apply the algorithm.
    // The mapping differs from the quality setting of the same name.
    // Only takes effect when FXAA_EARLY_EXIT is 1.
    //
    // The PS3 is ALU bound, so only two values are safe there: 0.25 and 0.125.
    // Those rely on the shader's ability to do a free *|/ by 4|8.
    //
    //   0.125 - leaves less aliasing, but is softer (selected)
    //   0.25  - leaves more aliasing, and is sharper
    //
    #define FXAA_CONSOLE__EDGE_THRESHOLD 0.125
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_CONSOLE__EDGE_THRESHOLD_MIN)
    //
    // Trims the algorithm from processing darks.
    // The mapping differs from the quality setting of the same name.
    // Only takes effect when FXAA_EARLY_EXIT is 1.
    //
    // Not used on PS3, which was simplified to avoid more shader instructions.
    //
    #define FXAA_CONSOLE__EDGE_THRESHOLD_MIN 0.05
#endif
/*============================================================================
FXAA QUALITY - TUNING KNOBS
============================================================================*/
#if !defined(FXAA_QUALITY__EDGE_THRESHOLD)
    //
    // Minimum amount of local contrast required to apply the algorithm.
    //
    //   1/3  - too little
    //   1/4  - low quality
    //   1/6  - default (the value set here)
    //   1/8  - high quality
    //   1/16 - overkill
    //
    #define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0)
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_QUALITY__EDGE_THRESHOLD_MIN)
    //
    // Trims the algorithm from processing darks.
    //
    //   1/32 - visible limit
    //   1/16 - high quality
    //   1/12 - upper limit (default, the start of visible unfiltered edges)
    //
    #define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0)
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_QUALITY__SUBPIX)
    //
    // Amount of sub-pixel aliasing removal.
    //
    //   1   - upper limit (softer)
    //   3/4 - default amount of filtering
    //   1/2 - lower limit (sharper, less sub-pixel aliasing removal)
    //
    // This default only applies when the includer has not already
    // defined FXAA_QUALITY__SUBPIX.
    //
    #define FXAA_QUALITY__SUBPIX (3.0/4.0)
#endif
/*--------------------------------------------------------------------------*/
#if !defined(FXAA_QUALITY__PRESET)
    //
    // Quality preset selection.
    //
    // OPTIONS
    // -----------------------------------------------------------------------
    //   10 to 15 - default medium dither (10=fastest, 15=highest quality)
    //   20 to 29 - less dither, more expensive (20=fastest, 29=highest quality)
    //   39       - no dither, very expensive
    //
    // NOTES
    // -----------------------------------------------------------------------
    //   12 = slightly faster than FXAA 3.9 and higher edge quality (default)
    //   13 = about same speed as FXAA 3.9 and better than 12
    //   23 = closest to FXAA 3.9 visually and performance wise
    //   The lowest digit is directly related to performance,
    //   the highest digit is directly related to style.
    //
    #define FXAA_QUALITY__PRESET 12
#endif
/*============================================================================
FXAA QUALITY - PRESETS
============================================================================*/
/*============================================================================
FXAA QUALITY - MEDIUM DITHER PRESETS
============================================================================*/
// Preset table: at most one branch matches FXAA_QUALITY__PRESET.
// Each branch defines FXAA_QUALITY__PS (the number of P entries) followed by
// FXAA_QUALITY__P0 .. P(PS-1). An unlisted preset value defines nothing.
#if (FXAA_QUALITY__PRESET == 10)
    #define FXAA_QUALITY__PS 3
    #define FXAA_QUALITY__P0 1.5
    #define FXAA_QUALITY__P1 3.0
    #define FXAA_QUALITY__P2 12.0
#elif (FXAA_QUALITY__PRESET == 11)
    #define FXAA_QUALITY__PS 4
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 3.0
    #define FXAA_QUALITY__P3 12.0
#elif (FXAA_QUALITY__PRESET == 12)
    #define FXAA_QUALITY__PS 5
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 4.0
    #define FXAA_QUALITY__P4 12.0
#elif (FXAA_QUALITY__PRESET == 13)
    #define FXAA_QUALITY__PS 6
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 4.0
    #define FXAA_QUALITY__P5 12.0
#elif (FXAA_QUALITY__PRESET == 14)
    #define FXAA_QUALITY__PS 7
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 4.0
    #define FXAA_QUALITY__P6 12.0
#elif (FXAA_QUALITY__PRESET == 15)
    #define FXAA_QUALITY__PS 8
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 4.0
    #define FXAA_QUALITY__P7 12.0
/*============================================================================
FXAA QUALITY - LOW DITHER PRESETS
============================================================================*/
#elif (FXAA_QUALITY__PRESET == 20)
    #define FXAA_QUALITY__PS 3
    #define FXAA_QUALITY__P0 1.5
    #define FXAA_QUALITY__P1 2.0
    #define FXAA_QUALITY__P2 8.0
#elif (FXAA_QUALITY__PRESET == 21)
    #define FXAA_QUALITY__PS 4
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 8.0
#elif (FXAA_QUALITY__PRESET == 22)
    #define FXAA_QUALITY__PS 5
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 8.0
#elif (FXAA_QUALITY__PRESET == 23)
    #define FXAA_QUALITY__PS 6
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 8.0
#elif (FXAA_QUALITY__PRESET == 24)
    #define FXAA_QUALITY__PS 7
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 3.0
    #define FXAA_QUALITY__P6 8.0
#elif (FXAA_QUALITY__PRESET == 25)
    #define FXAA_QUALITY__PS 8
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 4.0
    #define FXAA_QUALITY__P7 8.0
#elif (FXAA_QUALITY__PRESET == 26)
    #define FXAA_QUALITY__PS 9
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 2.0
    #define FXAA_QUALITY__P7 4.0
    #define FXAA_QUALITY__P8 8.0
#elif (FXAA_QUALITY__PRESET == 27)
    #define FXAA_QUALITY__PS 10
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 2.0
    #define FXAA_QUALITY__P7 2.0
    #define FXAA_QUALITY__P8 4.0
    #define FXAA_QUALITY__P9 8.0
#elif (FXAA_QUALITY__PRESET == 28)
    #define FXAA_QUALITY__PS 11
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 2.0
    #define FXAA_QUALITY__P7 2.0
    #define FXAA_QUALITY__P8 2.0
    #define FXAA_QUALITY__P9 4.0
    #define FXAA_QUALITY__P10 8.0
#elif (FXAA_QUALITY__PRESET == 29)
    #define FXAA_QUALITY__PS 12
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.5
    #define FXAA_QUALITY__P2 2.0
    #define FXAA_QUALITY__P3 2.0
    #define FXAA_QUALITY__P4 2.0
    #define FXAA_QUALITY__P5 2.0
    #define FXAA_QUALITY__P6 2.0
    #define FXAA_QUALITY__P7 2.0
    #define FXAA_QUALITY__P8 2.0
    #define FXAA_QUALITY__P9 2.0
    #define FXAA_QUALITY__P10 4.0
    #define FXAA_QUALITY__P11 8.0
/*============================================================================
FXAA QUALITY - EXTREME QUALITY
============================================================================*/
#elif (FXAA_QUALITY__PRESET == 39)
    #define FXAA_QUALITY__PS 12
    #define FXAA_QUALITY__P0 1.0
    #define FXAA_QUALITY__P1 1.0
    #define FXAA_QUALITY__P2 1.0
    #define FXAA_QUALITY__P3 1.0
    #define FXAA_QUALITY__P4 1.0
    #define FXAA_QUALITY__P5 1.5
    #define FXAA_QUALITY__P6 2.0
    #define FXAA_QUALITY__P7 2.0
    #define FXAA_QUALITY__P8 2.0
    #define FXAA_QUALITY__P9 2.0
    #define FXAA_QUALITY__P10 4.0
    #define FXAA_QUALITY__P11 8.0
#endif
/*============================================================================
API PORTING
============================================================================*/
// Each section below maps the HLSL-style type names and the Fxaa* helper
// macros onto one target shading language. Helper macro parameters:
//   t = texture/sampler, p = texture coordinate,
//   o = offset passed to the *Offset fetch, or multiplied by r in the
//       fallback form,
//   r = scale applied to o in the fallback form (ignored otherwise).
#if FXAA_GLSL_120
// Requires,
// #version 120
// And at least,
// #extension GL_EXT_gpu_shader4 : enable
// (or set FXAA_FAST_PIXEL_OFFSET 0 to work like DX9: FxaaTexOff then uses p + (o * r))
#define half float
#define half2 vec2
#define half3 vec3
#define half4 vec4
#define int2 ivec2
#define float2 vec2
#define float3 vec3
#define float4 vec4
#define FxaaInt2 ivec2
#define FxaaFloat2 vec2
#define FxaaFloat3 vec3
#define FxaaFloat4 vec4
#define FxaaDiscard discard
#define FxaaDot3(a, b) dot(a, b)
#define FxaaSat(x) clamp(x, 0.0, 1.0)
#define FxaaLerp(x,y,s) mix(x,y,s)
#define FxaaTex sampler2D
#define FxaaTexTop(t, p) texture2DLod(t, p, 0.0)
// Offset fetch: built-in offset variant when available, otherwise add o * r to p.
#if (FXAA_FAST_PIXEL_OFFSET == 1)
#define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o)
#else
#define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0)
#endif
// Gather of component 3 (alpha), with and without an offset.
#if (FXAA_GATHER4_ALPHA == 1)
// use #extension GL_ARB_gpu_shader5 : enable
#define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3)
#define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3)
#endif
#endif
/*--------------------------------------------------------------------------*/
#if FXAA_GLSL_130
// Requires "#version 130" or better
#define half float
#define half2 vec2
#define half3 vec3
#define half4 vec4
#define int2 ivec2
#define float2 vec2
#define float3 vec3
#define float4 vec4
#define FxaaInt2 ivec2
#define FxaaFloat2 vec2
#define FxaaFloat3 vec3
#define FxaaFloat4 vec4
#define FxaaDiscard discard
#define FxaaDot3(a, b) dot(a, b)
#define FxaaSat(x) clamp(x, 0.0, 1.0)
#define FxaaLerp(x,y,s) mix(x,y,s)
#define FxaaTex sampler2D
#define FxaaTexTop(t, p) textureLod(t, p, 0.0)
#define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o)
// Gather of component 3 (alpha), with and without an offset.
#if (FXAA_GATHER4_ALPHA == 1)
// use #extension GL_ARB_gpu_shader5 : enable
#define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3)
#define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3)
#endif
#endif
/*--------------------------------------------------------------------------*/
// HLSL 3 and Xbox 360: int2 is remapped to float2, and offset fetches are
// done by adding o * r to the coordinate before tex2Dlod.
#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1)
#define int2 float2
#define FxaaInt2 float2
#define FxaaFloat2 float2
#define FxaaFloat3 float3
#define FxaaFloat4 float4
#define FxaaDiscard clip(-1)
#define FxaaDot3(a, b) dot(a, b)
#define FxaaSat(x) saturate(x)
#define FxaaLerp(x,y,s) lerp(x,y,s)
#define FxaaTex sampler2D
#define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0))
#define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0))
#endif
/*--------------------------------------------------------------------------*/
// HLSL 4: FxaaTex bundles a sampler state with a texture; fetches go through
// SampleLevel at level 0.0.
#if FXAA_HLSL_4
#define FxaaInt2 int2
#define FxaaFloat2 float2
#define FxaaFloat3 float3
#define FxaaFloat4 float4
#define FxaaDiscard clip(-1)
#define FxaaDot3(a, b) dot(a, b)
#define FxaaSat(x) saturate(x)
#define FxaaLerp(x,y,s) lerp(x,y,s)
struct FxaaTex { SamplerState smpl; Texture2D tex; };
#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)
#endif
/*--------------------------------------------------------------------------*/
// HLSL 5: same mapping as HLSL 4 plus the GatherAlpha based helpers.
#if FXAA_HLSL_5
#define FxaaInt2 int2
#define FxaaFloat2 float2
#define FxaaFloat3 float3
#define FxaaFloat4 float4
#define FxaaDiscard clip(-1)
#define FxaaDot3(a, b) dot(a, b)
#define FxaaSat(x) saturate(x)
#define FxaaLerp(x,y,s) lerp(x,y,s)
struct FxaaTex { SamplerState smpl; Texture2D tex; };
#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)
#define FxaaTexAlpha4(t, p, r) t.tex.GatherAlpha(t.smpl, p)
#define FxaaTexOffAlpha4(t, p, o, r) t.tex.GatherAlpha(t.smpl, p, o)
#endif
/*============================================================================
FXAA3 CONSOLE - 360 PIXEL SHADER
------------------------------------------------------------------------------
Might be some optimizations left here,
as of this latest change didn't have a PIX dump to verify if TEX bound.
============================================================================*/
#if (FXAA_360 == 1)
/*--------------------------------------------------------------------------*/
// FXAA3 Console pixel shader, Xbox 360 variant.
//
//   pos         {xy}   = center of pixel
//   posPos      {xy__} = upper left of pixel
//               {__zw} = lower right of pixel
//   tex         {rgb_} = color in linear or perceptual color space,
//               {___a} = luma (read from .w of every fetch)
//   rcpFrame    not used by this shader
//   rcpFrameOpt must come from a constant/uniform:
//               {x___} = 2.0/screenWidthInPixels
//               {_y__} = 2.0/screenHeightInPixels
//               {__z_} = 0.5/screenWidthInPixels
//               {___w} = 0.5/screenHeightInPixels
//
// Returns the filtered color; the alpha of the result is a junk value.
half4 FxaaPixelShader(
    float2 pos,
    float4 posPos,
    FxaaTex tex,
    float2 rcpFrame,
    float4 rcpFrameOpt
) {
    // Luma at the four pixel corners, packed as {NW, NE, SW, SE}.
    half4 cornerLuma;
    cornerLuma.x = FxaaTexTop(tex, posPos.xy).w;
    cornerLuma.y = FxaaTexTop(tex, posPos.zy).w;
    cornerLuma.z = FxaaTexTop(tex, posPos.xw).w;
    cornerLuma.w = FxaaTexTop(tex, posPos.zw).w;

    // Center tap: color in rgb, luma in w.
    half4 centerTap = FxaaTexTop(tex, pos.xy);

    // Small constant bias on the NE luma.
    // NOTE(review): presumably keeps the direction below from being exactly
    // zero before normalize() - confirm.
    cornerLuma.y += 1.0/384.0;

    // Luma range of the corners, then of corners plus center.
    half2 pairMin = min(cornerLuma.xy, cornerLuma.zw);
    half2 pairMax = max(cornerLuma.xy, cornerLuma.zw);
    half lumaLo = min(pairMin.x, pairMin.y);
    half lumaHi = max(pairMax.x, pairMax.y);
    half lumaLoWithCenter = min(lumaLo, centerTap.w);
    half lumaHiWithCenter = max(lumaHi, centerTap.w);

    // Early exit: local contrast is below the threshold, keep the pixel as is.
    if((lumaHiWithCenter - lumaLoWithCenter) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaHi * FXAA_CONSOLE__EDGE_THRESHOLD)) {
        return centerTap;
    }

    // Direction built from the corner lumas:
    //   x = (SW + SE) - (NW + NE),  y = (NW + SW) - (NE + SE).
    half2 edgeDir;
    edgeDir.x = dot(cornerLuma, float4(-1.0, -1.0, 1.0, 1.0));
    edgeDir.y = dot(cornerLuma, float4( 1.0, -1.0, 1.0,-1.0));

    // Unit direction for the narrow taps.
    half2 narrowDir = normalize(edgeDir.xy);

    // Stretched direction for the wide taps, limited to [-2, 2] per axis.
    half minAxisTimesSharpness = min(abs(narrowDir.x), abs(narrowDir.y)) * FXAA_CONSOLE__EDGE_SHARPNESS;
    half2 wideDir = clamp(narrowDir.xy / minAxisTimesSharpness, -2.0, 2.0);

    // Two taps along the narrow direction, two along the wide one.
    half4 narrowNeg = FxaaTexTop(tex, pos.xy - narrowDir * rcpFrameOpt.zw);
    half4 narrowPos = FxaaTexTop(tex, pos.xy + narrowDir * rcpFrameOpt.zw);
    half4 wideNeg = FxaaTexTop(tex, pos.xy - wideDir * rcpFrameOpt.xy);
    half4 widePos = FxaaTexTop(tex, pos.xy + wideDir * rcpFrameOpt.xy);

    // Two-tap average and four-tap average.
    half4 twoTapAvg = narrowNeg * 0.5 + narrowPos * 0.5;
    half4 fourTapAvg = wideNeg * 0.25 + widePos * 0.25 + twoTapAvg * 0.5;

    // If the four-tap luma falls outside the corner luma range,
    // use the two-tap color instead (only rgb is replaced).
    if((fourTapAvg.w < lumaLo) || (fourTapAvg.w > lumaHi)) {
        fourTapAvg.xyz = twoTapAvg.xyz;
    }
    return fourTapAvg;
}
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE
------------------------------------------------------------------------------
This prototype optimized version thanks to suggestions from Andy Luedke.
Should be fully tex bound in all cases.
As of the FXAA 3.10 release I have not tested this code,
but at least the missing ";" was fixed.
If it does not work, please let me know so I can fix it.
------------------------------------------------------------------------------
Extra requirements,
(1.) Different inputs: no posPos.
(2.) Different inputs: alias three samplers with different exp bias settings!
(3.) New constants: setup fxaaConst as described below.
============================================================================*/
#if (FXAA_360_OPT == 1)
/*--------------------------------------------------------------------------*/
// FXAA3 Console pixel shader, optimized Xbox 360 prototype.
//
// Fixes relative to the FXAA 3.10 text of this function:
//  - fxaaConstDir / fxaaConstInner / fxaaConstOuter now carry their float4
//    type (the parameter list did not compile without it).
//  - lumaMinM / lumaMaxM are scalars, so the early-exit condition is scalar.
//  - The wide-tap offset is remapped with fxaaConstDir.zzww (0.25 / -0.25),
//    which is what makes "saturate(x + 0.5) * 8/size - 4/size" equal to
//    "clamp(dir / c, -2, 2) * 2/size" as in the non-optimized shader.
//  - The final blend is rgbyN2 + rgbyP2 + rgbyA * 0.5. With the -1 / -2
//    exponent-bias samplers scaling fetches by 0.5 / 0.25 this reproduces
//    the 0.25 / 0.25 / 0.5 weights of the non-optimized shader.
//
// Returns the filtered color; the alpha of the result is a junk value.
[reduceTempRegUsage(4)]
float4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // Three samplers aliasing the same texture,
    // texExpBias0 = exponent bias 0
    // texExpBiasNeg1 = exponent bias -1
    // texExpBiasNeg2 = exponent bias -2
    // {rgb_} = color in linear or perceptual color space
    // {___a} = luma
    uniform sampler2D texExpBias0,
    uniform sampler2D texExpBiasNeg1,
    uniform sampler2D texExpBiasNeg2,
    // These must be in physical constant registers and NOT immediates.
    // Immediates will result in the compiler un-optimizing.
    // width = screen width in pixels
    // height = screen height in pixels
    float4 fxaaConstDir,   // float4(1.0, -1.0, 0.25, -0.25);
    float4 fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height);
    float4 fxaaConstOuter  // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height);
) {
/*--------------------------------------------------------------------------*/
    // Luma (alpha) at the four half-pixel corners, packed as {NW, NE, SW, SE}.
    float4 lumaNwNeSwSe;
    asm {
        tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false
        tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false
    };
/*--------------------------------------------------------------------------*/
    // Small constant bias on the NE luma, then the corner luma range.
    lumaNwNeSwSe.y += 1.0/384.0;
    float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
    float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);
    float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);
/*--------------------------------------------------------------------------*/
    // Center tap and early exit when the local contrast is below threshold.
    float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0));
    float lumaMinM = min(lumaMin, rgbyM.w);
    float lumaMaxM = max(lumaMax, rgbyM.w);
    if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) return rgbyM;
/*--------------------------------------------------------------------------*/
    // Direction from the corner lumas:
    //   x = (SW + SE) - (NW + NE),  y = (NW + SW) - (NE + SE).
    float2 dir;
    dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx);
    dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy);
    dir = normalize(dir);
/*--------------------------------------------------------------------------*/
    // Narrow tap offsets: {xy} = +0.5 pixel along dir, {zw} = -0.5 pixel.
    float4 dir1 = dir.xyxy * fxaaConstInner.xyzw;
/*--------------------------------------------------------------------------*/
    // Wide tap offsets: clamp(dir / c, -2, 2) * 2/size, expressed as a
    // saturate followed by a multiply-add with the fxaaConstOuter terms.
    float4 dir2;
    float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y));
    dir2 = saturate(fxaaConstDir.zzww * dir.xyxy / FXAA_CONSOLE__EDGE_SHARPNESS / dirAbsMinTimesC + 0.5);
    dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw;
/*--------------------------------------------------------------------------*/
    // Narrow taps come back scaled by 0.5, wide taps by 0.25 (exponent bias).
    float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0));
    float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0));
    float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0));
    float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
    // Two-tap average, and four-tap average (weights 0.25, 0.25, 0.5).
    half4 rgbyA = rgbyN1 + rgbyP1;
    half4 rgbyB = rgbyN2 + rgbyP2 + rgbyA * 0.5;
/*--------------------------------------------------------------------------*/
    // Use the two-tap result when the four-tap luma leaves the corner range.
    float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB;
    rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA;
    return rgbyR; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)
==============================================================================
The code below does not exactly match the assembly.
I have a feeling that 12 cycles is possible, but was not able to get there.
Might have to increase register count to get full performance.
Note this shader does not use perspective interpolation.
Use the following cgc options,
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
------------------------------------------------------------------------------
NVSHADERPERF OUTPUT
------------------------------------------------------------------------------
For reference and to aid in debug, output of NVShaderPerf should match this,
Shader to schedule:
0: texpkb h0.w(TRUE), v5.zyxx, #0
2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
4: texpkb h0.w(TRUE), v5.xwxx, #0
6: addh h0.z(TRUE), -h2, h0.w
7: texpkb h1.w(TRUE), v5, #0
9: addh h0.x(TRUE), h0.z, -h1.w
10: addh h3.w(TRUE), h0.z, h1
11: texpkb h2.w(TRUE), v5.zwzz, #0
13: addh h0.z(TRUE), h3.w, -h2.w
14: addh h0.x(TRUE), h2.w, h0
15: nrmh h1.xz(TRUE), h0_n
16: minh_m8 h0.x(TRUE), |h1|, |h1.z|
17: maxh h4.w(TRUE), h0, h1
18: divx h2.xy(TRUE), h1_n.xzzw, h0_n
19: movr r1.zw(TRUE), v4.xxxy
20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww
22: minh h5.w(TRUE), h0, h1
23: texpkb h0(TRUE), r2.xzxx, #0
25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1
27: maxh h4.x(TRUE), h2.z, h2.w
28: texpkb h1(TRUE), r0.zwzz, #0
30: addh_d2 h1(TRUE), h0, h1
31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
33: texpkb h0(TRUE), r0, #0
35: minh h4.z(TRUE), h2, h2.w
36: fenct TRUE
37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
39: texpkb h2(TRUE), r1, #0
41: addh_d2 h0(TRUE), h0, h2
42: maxh h2.w(TRUE), h4, h4.x
43: minh h2.x(TRUE), h5.w, h4.z
44: addh_d2 h0(TRUE), h0, h1
45: slth h2.x(TRUE), h0.w, h2
46: sgth h2.w(TRUE), h0, h2
47: movh h0(TRUE), h0
48: addx.c0 rc(TRUE), h2, h2.w
49: movh h0(c0.NE.x), h1
IPU0 ------ Simplified schedule: --------
Pass | Unit | uOp | PC: Op
-----+--------+------+-------------------------
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-;
| | |
2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
| TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0;
| SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-;
| | |
3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
| TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0;
| SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---;
| SCB1 | add | 10: ADDh h3.w, h0.---z, h1;
| | |
4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| SCB0 | add | 14: ADDh h0.x, h2.w---, h0;
| SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-;
| | |
5 | SCT1 | mov | 15: NRMh h1.xz, h0;
| SRB | nrm | 15: NRMh h1.xz, h0;
| SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|;
| SCB1 | max | 17: MAXh h4.w, h0, h1;
| | |
6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0;
| SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy;
| SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-;
| SCB1 | min | 22: MINh h5.w, h0, h1;
| | |
7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
| TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0;
| SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---;
| SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1;
| | |
8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
| TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0;
| SCB0/1 | add | 30: ADDh/2 h1, h0, h1;
| | |
9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--;
| SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0;
| TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0;
| SCB1 | min | 35: MINh h4.z, h2, h2.--w-;
| | |
10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--;
| SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0;
| TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0;
| SCB0/1 | add | 41: ADDh/2 h0, h0, h2;
| | |
11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---;
| SCT1 | max | 42: MAXh h2.w, h4, h4.---x;
| SCB0/1 | add | 44: ADDh/2 h0, h0, h1;
| | |
12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;
| SCT1 | set | 46: SGTh h2.w, h0, h2;
| SCB0/1 | mul | 47: MOVh h0, h0;
| | |
13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;
| SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;
Pass SCT TEX SCB
1: 0% 100% 25%
2: 0% 100% 25%
3: 0% 100% 50%
4: 0% 100% 50%
5: 0% 0% 50%
6: 100% 0% 75%
7: 0% 100% 75%
8: 0% 100% 100%
9: 0% 100% 25%
10: 0% 100% 100%
11: 50% 0% 100%
12: 50% 0% 100%
13: 25% 0% 100%
MEAN: 17% 61% 67%
Pass SCT0 SCT1 TEX SCB0 SCB1
1: 0% 0% 100% 0% 100%
2: 0% 0% 100% 0% 100%
3: 0% 0% 100% 100% 100%
4: 0% 0% 100% 100% 100%
5: 0% 0% 0% 100% 100%
6: 100% 100% 0% 100% 100%
7: 0% 0% 100% 100% 100%
8: 0% 0% 100% 100% 100%
9: 0% 0% 100% 0% 100%
10: 0% 0% 100% 100% 100%
11: 100% 100% 0% 100% 100%
12: 100% 100% 0% 100% 100%
13: 100% 0% 0% 100% 100%
MEAN: 30% 23% 61% 76% 100%
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
Results 13 cycles, 3 r regs, 923,076,923 pixels/s
============================================================================*/
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)
/*--------------------------------------------------------------------------*/
#pragma disablepc all
#pragma option O3
#pragma option OutColorPrec=fp16
#pragma texformat default RGBA8
/*==========================================================================*/
/*
 * FXAA3 Console pixel shader, PS3 variant without early exit.
 *
 * Reads luma (.w) at the four corners of the pixel, builds an edge direction
 * from the corner differences, then fetches two pairs of taps along that
 * direction: a pair offset by rcpFrameOpt.zw and a pair offset by
 * rcpFrameOpt.xy. Returns the 4-tap average unless its luma lies outside the
 * [min, max] range of the corner lumas, in which case the 2-tap average is
 * returned instead.
 *
 * The "(n)" markers are kept from the original source.
 * NOTE(review): they appear to track the passes of the NVShaderPerf listing
 * in the comment above this shader; avoid reordering without re-profiling.
 */
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = luma in perceptual color space (not linear)
    sampler2D tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PS3
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
// (1)
    half4 dir;
    // dir.xyz is fed to normalize() in step (5); only x and z carry data, so
    // y must be zero. Previously y was read uninitialized.
    dir.y = 0.0;
    half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
    // Small constant bias on the NE luma.
    lumaNe.w += half(1.0/512.0);
    dir.x = -lumaNe.w;
    dir.z = -lumaNe.w;
/*--------------------------------------------------------------------------*/
// (2)
    half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
    dir.x += lumaSw.w;
    dir.z += lumaSw.w;
/*--------------------------------------------------------------------------*/
// (3)
    half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
    dir.x -= lumaNw.w;
    dir.z += lumaNw.w;
/*--------------------------------------------------------------------------*/
// (4)
    half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
    dir.x += lumaSe.w;
    dir.z -= lumaSe.w;
/*--------------------------------------------------------------------------*/
// (5)
    // dir1_pos.xy = unit edge direction, dir1_pos.zw = pixel center (set in 6).
    half4 dir1_pos;
    dir1_pos.xy = normalize(dir.xyz).xz;
    half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS);
/*--------------------------------------------------------------------------*/
// (6)
    // dir2_pos.xy = direction rescaled by its smaller component, clamped to
    // [-2, 2]; dir2_pos.zw = pixel center.
    half4 dir2_pos;
    dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
    dir1_pos.zw = pos.xy;
    dir2_pos.zw = pos.xy;
    // temp1N.xy holds the negative-side tap coordinate until it is
    // overwritten by the fetched sample in step (7).
    half4 temp1N;
    temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (7)
    temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
    half4 rgby1;
    rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;
/*--------------------------------------------------------------------------*/
// (8)
    // rgby1 becomes the 2-tap average (first pair).
    rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
    rgby1 = (temp1N + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (9)
    half4 temp2N;
    temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;
    temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
/*--------------------------------------------------------------------------*/
// (10)
    half4 rgby2;
    rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;
    rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
    rgby2 = (temp2N + rgby2) * 0.5;
/*--------------------------------------------------------------------------*/
// (11)
    // compiler moves these scalar ops up to other cycles
    half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));
    half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));
    // rgby2 becomes the 4-tap average (mean of both pair averages).
    rgby2 = (rgby2 + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (12)
    bool twoTapLt = rgby2.w < lumaMin;
    bool twoTapGt = rgby2.w > lumaMax;
/*--------------------------------------------------------------------------*/
// (13)
    // Fall back to the 2-tap result when the 4-tap luma leaves the local range.
    if(twoTapLt || twoTapGt) rgby2 = rgby1;
/*--------------------------------------------------------------------------*/
    return rgby2; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)
==============================================================================
The code mostly matches the assembly.
I have a feeling that 14 cycles is possible, but was not able to get there.
Might have to increase register count to get full performance.
Note this shader does not use perspective interpolation.
Use the following cgc options,
--fenable-bx2 --fastmath --fastprecision --nofloatbindings
------------------------------------------------------------------------------
NVSHADERPERF OUTPUT
------------------------------------------------------------------------------
For reference and to aid in debug, output of NVShaderPerf should match this,
Shader to schedule:
0: texpkb h0.w(TRUE), v5.zyxx, #0
2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x
4: texpkb h1.w(TRUE), v5.xwxx, #0
6: addh h0.x(TRUE), h1.w, -h2.y
7: texpkb h2.w(TRUE), v5.zwzz, #0
9: minh h4.w(TRUE), h2.y, h2
10: maxh h5.x(TRUE), h2.y, h2.w
11: texpkb h0.w(TRUE), v5, #0
13: addh h3.w(TRUE), -h0, h0.x
14: addh h0.x(TRUE), h0.w, h0
15: addh h0.z(TRUE), -h2.w, h0.x
16: addh h0.x(TRUE), h2.w, h3.w
17: minh h5.y(TRUE), h0.w, h1.w
18: nrmh h2.xz(TRUE), h0_n
19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z|
20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w
21: movr r1.zw(TRUE), v4.xxxy
22: maxh h2.w(TRUE), h0, h1
23: fenct TRUE
24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
26: texpkb h0(TRUE), r0, #0
28: maxh h5.x(TRUE), h2.w, h5
29: minh h5.w(TRUE), h5.y, h4
30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz
32: texpkb h2(TRUE), r1, #0
34: addh_d2 h2(TRUE), h0, h2
35: texpkb h1(TRUE), v4, #0
37: maxh h5.y(TRUE), h5.x, h1.w
38: minh h4.w(TRUE), h1, h5
39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
41: texpkb h0(TRUE), r0, #0
43: addh_m8 h5.z(TRUE), h5.y, -h4.w
44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz
46: texpkb h3(TRUE), r2, #0
48: addh_d2 h0(TRUE), h0, h3
49: addh_d2 h3(TRUE), h0, h2
50: movh h0(TRUE), h3
51: slth h3.x(TRUE), h3.w, h5.w
52: sgth h3.w(TRUE), h3, h5.x
53: addx.c0 rc(TRUE), h3.x, h3
54: slth.c0 rc(TRUE), h5.z, h5
55: movh h0(c0.NE.w), h2
56: movh h0(c0.NE.x), h1
IPU0 ------ Simplified schedule: --------
Pass | Unit | uOp | PC: Op
-----+--------+------+-------------------------
1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0;
| SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--;
| | |
2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
| TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0;
| SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---;
| | |
3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0;
| SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---;
| SCB1 | min | 9: MINh h4.w, h2.---y, h2;
| | |
4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
| TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0;
| SCB0 | add | 14: ADDh h0.x, h0.w---, h0;
| SCB1 | add | 13: ADDh h3.w,-h0, h0.---x;
| | |
5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---;
| SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-;
| SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--;
| | |
6 | SCT1 | mov | 18: NRMh h2.xz, h0;
| SRB | nrm | 18: NRMh h2.xz, h0;
| SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|;
| | |
7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--;
| SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy;
| SCB1 | max | 22: MAXh h2.w, h0, h1;
| | |
8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--;
| SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0;
| TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0;
| SCB0 | max | 28: MAXh h5.x, h2.w---, h5;
| SCB1 | min | 29: MINh h5.w, h5.---y, h4;
| | |
9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--;
| SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0;
| TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0;
| SCB0/1 | add | 34: ADDh/2 h2, h0, h2;
| | |
10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
| TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0;
| SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--;
| SCB1 | min | 38: MINh h4.w, h1, h5;
| | |
11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--;
| SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0;
| TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0;
| SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--;
| SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-;
| | |
12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0;
| TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0;
| SCB0/1 | add | 48: ADDh/2 h0, h0, h3;
| | |
13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2;
| SCB0/1 | mul | 50: MOVh h0, h3;
| | |
14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---;
| SCT1 | set | 52: SGTh h3.w, h3, h5.---x;
| SCB0 | set | 54: SLThc0 rc, h5.z---, h5;
| SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3;
| | |
15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2;
| SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1;
Pass SCT TEX SCB
1: 0% 100% 25%
2: 0% 100% 25%
3: 0% 100% 50%
4: 0% 100% 50%
5: 50% 0% 25%
6: 0% 0% 25%
7: 100% 0% 25%
8: 0% 100% 50%
9: 0% 100% 100%
10: 0% 100% 50%
11: 0% 100% 75%
12: 0% 100% 100%
13: 100% 0% 100%
14: 50% 0% 50%
15: 100% 0% 100%
MEAN: 26% 60% 56%
Pass SCT0 SCT1 TEX SCB0 SCB1
1: 0% 0% 100% 100% 0%
2: 0% 0% 100% 100% 0%
3: 0% 0% 100% 100% 100%
4: 0% 0% 100% 100% 100%
5: 100% 100% 0% 100% 0%
6: 0% 0% 0% 0% 100%
7: 100% 100% 0% 0% 100%
8: 0% 0% 100% 100% 100%
9: 0% 0% 100% 100% 100%
10: 0% 0% 100% 100% 100%
11: 0% 0% 100% 100% 100%
12: 0% 0% 100% 100% 100%
13: 100% 100% 0% 100% 100%
14: 100% 100% 0% 100% 100%
15: 100% 100% 0% 100% 100%
MEAN: 33% 33% 60% 86% 80%
Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
Results 15 cycles, 3 r regs, 800,000,000 pixels/s
============================================================================*/
#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1)
/*--------------------------------------------------------------------------*/
#pragma disablepc all
#pragma option O2
#pragma option OutColorPrec=fp16
#pragma texformat default RGBA8
/*==========================================================================*/
/*
 * FXAA3 Console pixel shader, PS3 variant with early exit.
 *
 * Reads luma (.w) at the four corners of the pixel, builds an edge direction
 * from the corner differences, and fetches two pairs of taps along it (one
 * pair offset by rcpFrameOpt.zw, one by rcpFrameOpt.xy). The result is one of:
 *   - the unfiltered center sample, when the early-exit test passes,
 *   - the 2-tap average, when the 4-tap luma falls outside the corner
 *     luma range,
 *   - the 4-tap average otherwise.
 * All fetches and averages are computed unconditionally; the selection
 * happens at the end in step (15).
 *
 * The "(n)" markers are kept from the original source.
 * NOTE(review): they appear to track the passes of the NVShaderPerf listing
 * in the comment above this shader; avoid reordering without re-profiling.
 */
half4 FxaaPixelShader(
// {xy} = center of pixel
float2 pos,
// {xy__} = upper left of pixel
// {__zw} = lower right of pixel
float4 posPos,
// {rgb_} = color in linear or perceptual color space
// {___a} = luma in perceptual color space (not linear)
sampler2D tex,
// This must be from a constant/uniform.
// {xy} = rcpFrame not used on PS3
float2 rcpFrame,
// This must be from a constant/uniform.
// {x___} = 2.0/screenWidthInPixels
// {_y__} = 2.0/screenHeightInPixels
// {__z_} = 0.5/screenWidthInPixels
// {___w} = 0.5/screenHeightInPixels
float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
// (1)
half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0));
// Small constant bias on the NE luma.
half lumaNe = rgbyNe.w + half(1.0/512.0);
/*--------------------------------------------------------------------------*/
// (2)
half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0));
// Shared partial term (SW - NE) used by both direction components.
half lumaSwNegNe = lumaSw.w - lumaNe;
/*--------------------------------------------------------------------------*/
// (3)
half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0));
half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);
half lumaMinNwSw = min(lumaNw.w, lumaSw.w);
/*--------------------------------------------------------------------------*/
// (4)
half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0));
half dirZ = lumaNw.w + lumaSwNegNe;
half dirX = -lumaNw.w + lumaSwNegNe;
/*--------------------------------------------------------------------------*/
// (5)
// dir.x = SW - NE - NW + SE, dir.z = SW - NE + NW - SE; y is zeroed so the
// 3-component normalize in step (6) only sees x and z.
half3 dir;
dir.y = 0.0;
dir.x = lumaSe.w + dirX;
dir.z = -lumaSe.w + dirZ;
half lumaMinNeSe = min(lumaNe, lumaSe.w);
/*--------------------------------------------------------------------------*/
// (6)
// dir1_pos.xy = unit edge direction, dir1_pos.zw = pixel center (set in 7).
half4 dir1_pos;
dir1_pos.xy = normalize(dir).xz;
// Despite the "8" in the name, the scale factor is FXAA_CONSOLE__EDGE_SHARPNESS.
half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS);
/*--------------------------------------------------------------------------*/
// (7)
// dir2_pos.xy = direction rescaled by its smaller component, clamped to
// [-2, 2]; dir2_pos.zw = pixel center.
half4 dir2_pos;
dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));
dir1_pos.zw = pos.xy;
dir2_pos.zw = pos.xy;
half lumaMaxNeSe = max(lumaNe, lumaSe.w);
/*--------------------------------------------------------------------------*/
// (8)
// temp1N.xy holds the negative-side tap coordinate, then temp1N is
// overwritten with the fetched sample.
half4 temp1N;
temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw;
temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
// Min/max luma over the four corners.
half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);
half lumaMin = min(lumaMinNwSw, lumaMinNeSe);
/*--------------------------------------------------------------------------*/
// (9)
// rgby1 becomes the 2-tap average (first pair).
half4 rgby1;
rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw;
rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
rgby1 = (temp1N + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (10)
// Center sample; its luma widens the corner min/max for the early-exit test.
half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));
half lumaMaxM = max(lumaMax, rgbyM.w);
half lumaMinM = min(lumaMin, rgbyM.w);
/*--------------------------------------------------------------------------*/
// (11)
half4 temp2N;
temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy;
temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
half4 rgby2;
rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy;
// Local contrast divided by the threshold, so step (14) can compare it
// directly against lumaMax. FXAA_CONSOLE__EDGE_THRESHOLD_MIN is not
// consulted in this variant.
half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__EDGE_THRESHOLD;
/*--------------------------------------------------------------------------*/
// (12)
rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
rgby2 = (temp2N + rgby2) * 0.5;
/*--------------------------------------------------------------------------*/
// (13)
// rgby2 becomes the 4-tap average (mean of both pair averages).
rgby2 = (rgby2 + rgby1) * 0.5;
/*--------------------------------------------------------------------------*/
// (14)
bool twoTapLt = rgby2.w < lumaMin;
bool twoTapGt = rgby2.w > lumaMax;
bool earlyExit = lumaRangeM < lumaMax;
bool twoTap = twoTapLt || twoTapGt;
/*--------------------------------------------------------------------------*/
// (15)
// Order matters: the early-exit assignment comes last, so it overrides the
// 2-tap fallback when both conditions hold.
if(twoTap) rgby2 = rgby1;
if(earlyExit) rgby2 = rgbyM;
/*--------------------------------------------------------------------------*/
return rgby2; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 CONSOLE - PC PIXEL SHADER
------------------------------------------------------------------------------
Using a modified version of the PS3 version here to best target old hardware.
============================================================================*/
#if (FXAA_PC_CONSOLE == 1)
/*--------------------------------------------------------------------------*/
/*
 * FXAA3 Console pixel shader, PC variant.
 *
 * Reads luma (.w) at the four pixel corners, derives an edge direction from
 * the corner differences, and blends taps fetched along that direction.
 * Returns the 4-tap average, or the 2-tap average when the 4-tap luma falls
 * outside the corner luma range. With FXAA_EARLY_EXIT == 1, low-contrast
 * pixels leave early: the center sample is returned, or FxaaDiscard is
 * invoked when FXAA_DISCARD == 1.
 */
half4 FxaaPixelShader(
    // {xy} = center of pixel
    float2 pos,
    // {xy__} = upper left of pixel
    // {__zw} = lower right of pixel
    float4 posPos,
    // {rgb_} = color in linear or perceptual color space
    // {___a} = sampled and used as luma below
    //          (original note: alpha output is junk value)
    FxaaTex tex,
    // This must be from a constant/uniform.
    // {xy} = rcpFrame not used on PC version of FXAA Console
    float2 rcpFrame,
    // This must be from a constant/uniform.
    // {x___} = 2.0/screenWidthInPixels
    // {_y__} = 2.0/screenHeightInPixels
    // {__z_} = 0.5/screenWidthInPixels
    // {___w} = 0.5/screenHeightInPixels
    float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
    // Corner fetches. Each is stored as half4 before .w is read so the
    // precision of the luma values matches the original code.
    half4 cornerNe = FxaaTexTop(tex, posPos.zy);
    half4 cornerSw = FxaaTexTop(tex, posPos.xw);
    half4 cornerNw = FxaaTexTop(tex, posPos.xy);
    half4 cornerSe = FxaaTexTop(tex, posPos.zw);
    // NE luma carries a small constant bias.
    half lumNe = cornerNe.w + half(1.0/384.0);
    half lumSw = cornerSw.w;
    half lumNw = cornerNw.w;
    half lumSe = cornerSe.w;
/*--------------------------------------------------------------------------*/
    // Gradient: x = SW - NE - NW + SE, z = SW - NE + NW - SE, summed in the
    // same order as the original accumulation. y (and the unused w) are zero.
    half swMinusNe = lumSw - lumNe;
    half4 grad = half4((swMinusNe - lumNw) + lumSe,
                       0.0,
                       (swMinusNe + lumNw) - lumSe,
                       0.0);
/*--------------------------------------------------------------------------*/
    // Luma range of the four corners; needed by both the optional early
    // exit and the final 2-tap/4-tap selection.
    half lumaMin = min(min(lumNw, lumSw), min(lumNe, lumSe));
    half lumaMax = max(max(lumNw, lumSw), max(lumNe, lumSe));
/*==========================================================================*/
    #if (FXAA_EARLY_EXIT == 1)
        half4 rgbyM = FxaaTexTop(tex, pos.xy);
        half lumaMinM = min(lumaMin, rgbyM.w);
        half lumaMaxM = max(lumaMax, rgbyM.w);
        // Contrast too low to be worth filtering.
        if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) {
            #if (FXAA_DISCARD == 1)
                FxaaDiscard;
            #else
                return rgbyM;
            #endif
        }
    #endif
/*==========================================================================*/
    // tapDir.xy = unit edge direction.
    // tapDir.zw = same direction divided by (smaller |component| * sharpness),
    //             clamped to [-2, 2].
    half4 tapDir;
    tapDir.xy = normalize(grad.xyz).xz;
    half sharpScale = min(abs(tapDir.x), abs(tapDir.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS);
    tapDir.zw = clamp(tapDir.xy / sharpScale, half(-2.0), half(2.0));
/*--------------------------------------------------------------------------*/
    // Pixel center held in half precision, as in the original.
    half4 center;
    center.xy = pos.xy;
    center.zw = pos.xy;
    // Tap coordinates: .xy = negative side, .zw = positive side.
    half4 uvInner;
    uvInner.xy = center.xy - tapDir.xy * rcpFrameOpt.zw;
    uvInner.zw = center.zw + tapDir.xy * rcpFrameOpt.zw;
    half4 uvOuter;
    uvOuter.xy = center.xy - tapDir.zw * rcpFrameOpt.xy;
    uvOuter.zw = center.zw + tapDir.zw * rcpFrameOpt.xy;
/*--------------------------------------------------------------------------*/
    // First pair -> 2-tap average.
    half4 innerNeg = FxaaTexTop(tex, uvInner.xy);
    half4 innerPos = FxaaTexTop(tex, uvInner.zw);
    half4 twoTapAvg = (innerNeg + innerPos) * 0.5;
/*--------------------------------------------------------------------------*/
    // Second pair, then merged with the first -> 4-tap average.
    half4 outerNeg = FxaaTexTop(tex, uvOuter.xy);
    half4 outerPos = FxaaTexTop(tex, uvOuter.zw);
    half4 outerAvg = (outerNeg + outerPos) * 0.5;
    half4 fourTapAvg = (outerAvg + twoTapAvg) * 0.5;
/*--------------------------------------------------------------------------*/
    // Use the 2-tap result when the 4-tap luma leaves the corner luma range.
    bool belowRange = fourTapAvg.w < lumaMin;
    bool aboveRange = fourTapAvg.w > lumaMax;
    if(belowRange || aboveRange) return twoTapAvg;
/*--------------------------------------------------------------------------*/
    return fourTapAvg; }
/*==========================================================================*/
#endif
/*============================================================================
FXAA3 QUALITY - PC
============================================================================*/
#if (FXAA_PC == 1)
/*--------------------------------------------------------------------------*/
float4 FxaaPixelShader(
// {xy} = center of pixel
float2 pos,
// {xyzw} = not used on FXAA3 Quality
float4 posPos,
// {rgb_} = color in linear or perceptual color space
// {___a} = luma in perceptual color space (not linear)
FxaaTex tex,
// This must be from a constant/uniform.
// {x_} = 1.0/screenWidthInPixels
// {_y} = 1.0/screenHeightInPixels
float2 rcpFrame,
// {xyzw} = not used on FXAA3 Quality
float4 rcpFrameOpt
) {
/*--------------------------------------------------------------------------*/
float2 posM;
posM.x = pos.x;
posM.y = pos.y;
#if (FXAA_GATHER4_ALPHA == 1)
#if (FXAA_DISCARD == 0)
float4 rgbyM = FxaaTexTop(tex, posM);
#define lumaM rgbyM.w
#endif
float4 luma4A = FxaaTexAlpha4(tex, posM, rcpFrame.xy);
float4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1), rcpFrame.xy);
#if (FXAA_DISCARD == 1)
#define lumaM luma4A.w
#endif
#define lumaE luma4A.z
#define lumaS luma4A.x
#define lumaSE luma4A.y
#define lumaNW luma4B.w
#define lumaN luma4B.z
#define lumaW luma4B.x
#else
float4 rgbyM = FxaaTexTop(tex, posM);
#define lumaM rgbyM.w
float lumaS = FxaaTexOff(tex, posM, FxaaInt2( 0, 1), rcpFrame.xy).w;
float lumaE = FxaaTexOff(tex, posM, FxaaInt2( 1, 0), rcpFrame.xy).w;
float lumaN = FxaaTexOff(tex, posM, FxaaInt2( 0,-1), rcpFrame.xy).w;
float lumaW = FxaaTexOff(tex, posM, FxaaInt2(-1, 0), rcpFrame.xy).w;
#endif
/*--------------------------------------------------------------------------*/
float maxSM = max(lumaS, lumaM);
float minSM = min(lumaS, lumaM);
float maxESM = max(lumaE, maxSM);
float minESM = min(lumaE, minSM);
float maxWN = max(lumaN, lumaW);
float minWN = min(lumaN, lumaW);
float rangeMax = max(maxWN, maxESM);
float rangeMin = min(minWN, minESM);
float rangeMaxScaled = rangeMax * FXAA_QUALITY__EDGE_THRESHOLD;
float range = rangeMax - rangeMin;
float rangeMaxClamped = max(FXAA_QUALITY__EDGE_THRESHOLD_MIN, rangeMaxScaled);
bool earlyExit = range < rangeMaxClamped;
/*--------------------------------------------------------------------------*/
if(earlyExit)
#if (FXAA_DISCARD == 1)
FxaaDiscard;
#else
return rgbyM;
#endif
/*--------------------------------------------------------------------------*/
#if (FXAA_GATHER4_ALPHA == 0)
float lumaNW = FxaaTexOff(tex, posM, FxaaInt2(-1,-1), rcpFrame.xy).w;
float lumaSE = FxaaTexOff(tex, posM, FxaaInt2( 1, 1), rcpFrame.xy).w;
float lumaNE = FxaaTexOff(tex, posM, FxaaInt2( 1,-1), rcpFrame.xy).w;
float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w;
#else
float lumaNE = FxaaTexOff(tex, posM, FxaaInt2(1, -1), rcpFrame.xy).w;
float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w;
#endif
/*--------------------------------------------------------------------------*/
float lumaNS = lumaN + lumaS;
float lumaWE = lumaW + lumaE;
float subpixRcpRange = 1.0/range;
float subpixNSWE = lumaNS + lumaWE;
float edgeHorz1 = (-2.0 * lumaM) + lumaNS;
float edgeVert1 = (-2.0 * lumaM) + lumaWE;
/*--------------------------------------------------------------------------*/
float lumaNESE = lumaNE + lumaSE;
float lumaNWNE = lumaNW + lumaNE;
float edgeHorz2 = (-2.0 * lumaE) + lumaNESE;
float edgeVert2 = (-2.0 * lumaN) + lumaNWNE;
/*--------------------------------------------------------------------------*/
float lumaNWSW = lumaNW + lumaSW;
float lumaSWSE = lumaSW + lumaSE;
float edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);
float edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);
float edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;
float edgeVert3 = (-2.0 * lumaS) + lumaSWSE;
float edgeHorz = abs(edgeHorz3) + edgeHorz4;
float edgeVert = abs(edgeVert3) + edgeVert4;
/*--------------------------------------------------------------------------*/
float subpixNWSWNESE = lumaNWSW + lumaNESE;
float lengthSign = rcpFrame.x;
bool horzSpan = edgeHorz >= edgeVert;
float subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;
/*--------------------------------------------------------------------------*/
if(!horzSpan) lumaN = lumaW;
if(!horzSpan) lumaS = lumaE;
if(horzSpan) lengthSign = rcpFrame.y;
float subpixB = (subpixA * (1.0/12.0)) - lumaM;
/*--------------------------------------------------------------------------*/
float gradientN = lumaN - lumaM;
float gradientS = lumaS - lumaM;
float lumaNN = lumaN + lumaM;
float lumaSS = lumaS + lumaM;
bool pairN = abs(gradientN) >= abs(gradientS);
float gradient = max(abs(gradientN), abs(gradientS));
if(pairN) lengthSign = -lengthSign;
float subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);
/*--------------------------------------------------------------------------*/
float2 posB;
posB.x = posM.x;
posB.y = posM.y;
float2 offNP;
offNP.x = (!horzSpan) ? 0.0 : rcpFrame.x;
offNP.y = ( horzSpan) ? 0.0 : rcpFrame.y;
if(!horzSpan) posB.x += lengthSign * 0.5;
if( horzSpan) posB.y += lengthSign * 0.5;
/*--------------------------------------------------------------------------*/
float2 posN;
posN.x = posB.x - offNP.x * FXAA_QUALITY__P0;
posN.y = posB.y - offNP.y * FXAA_QUALITY__P0;
float2 posP;
posP.x = posB.x + offNP.x * FXAA_QUALITY__P0;
posP.y = posB.y + offNP.y * FXAA_QUALITY__P0;
float subpixD = ((-2.0)*subpixC) + 3.0;
float lumaEndN = FxaaTexTop(tex, posN).w;
float subpixE = subpixC * subpixC;
float lumaEndP = FxaaTexTop(tex, posP).w;
/*--------------------------------------------------------------------------*/
if(!pairN) lumaNN = lumaSS;
float gradientScaled = gradient * 1.0/4.0;
float lumaMM = lumaM - lumaNN * 0.5;
float subpixF = subpixD * subpixE;
bool lumaMLTZero = lumaMM < 0.0;
/*--------------------------------------------------------------------------*/
lumaEndN -= lumaNN * 0.5;
lumaEndP -= lumaNN * 0.5;
bool doneN = abs(lumaEndN) >= gradientScaled;
bool doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1;
bool doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1;
/*--------------------------------------------------------------------------*/
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 3)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 4)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 5)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 6)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 7)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 8)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 9)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 10)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 11)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11;
/*--------------------------------------------------------------------------*/
#if (FXAA_QUALITY__PS > 12)
if(doneNP) {
if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w;
if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w;
if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
doneN = abs(lumaEndN) >= gradientScaled;
doneP = abs(lumaEndP) >= gradientScaled;
if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12;
if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12;
doneNP = (!doneN) || (!doneP);
if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12;
if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12;
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
#endif
/*--------------------------------------------------------------------------*/
}
/*--------------------------------------------------------------------------*/
float dstN = posM.x - posN.x;
float dstP = posP.x - posM.x;
if(!horzSpan) dstN = posM.y - posN.y;
if(!horzSpan) dstP = posP.y - posM.y;
/*--------------------------------------------------------------------------*/
bool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;
float spanLength = (dstP + dstN);
bool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;
float spanLengthRcp = 1.0/spanLength;
/*--------------------------------------------------------------------------*/
bool directionN = dstN < dstP;
float dst = min(dstN, dstP);
bool goodSpan = directionN ? goodSpanN : goodSpanP;
float subpixG = subpixF * subpixF;
float pixelOffset = (dst * (-spanLengthRcp)) + 0.5;
float subpixH = subpixG * FXAA_QUALITY__SUBPIX;
/*--------------------------------------------------------------------------*/
float pixelOffsetGood = goodSpan ? pixelOffset : 0.0;
float pixelOffsetSubpix = max(pixelOffsetGood, subpixH);
if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;
if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;
#if (FXAA_DISCARD == 1)
return FxaaTexTop(tex, posM);
#else
return float4(FxaaTexTop(tex, posM).xyz, lumaM);
#endif
}
/*==========================================================================*/
#endif
/*----------------------------------------------------------------------------
  Pixel shader entry point.
  Forwards the interpolated texture coordinate (input.t) together with the
  bound source texture and the _rcpFrame / _rcpFrameOpt constants to
  FxaaPixelShader, and returns its result as the single colour output.
----------------------------------------------------------------------------*/
PS_OUTPUT ps_main(PS_INPUT input)
{
    // Describe the source texture in the form FxaaPixelShader expects.
    FxaaTex source;
#if SHADER_MODEL >= 0x400
    // SM4+: the texture object and its sampler state are separate members.
    source.smpl = TextureSampler;
    source.tex = Texture;
#else
    // SM3 and below: the bound sampler is assigned directly.
    source = Texture;
#endif

    float2 uv = input.t;

    // posPos is supplied as an all-zero vector.
    // NOTE(review): presumably unused by this FXAA configuration - confirm
    // against FxaaPixelShader.
    float4 zeroPosPos = float4(0.0, 0.0, 0.0, 0.0);

    PS_OUTPUT result;
    result.c = FxaaPixelShader(uv, zeroPosPos, source, _rcpFrame.xy, _rcpFrameOpt);
    return result;
}
#endif